Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Kconfig.binfmt9
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/callback.c4
-rw-r--r--fs/afs/dir.c223
-rw-r--r--fs/afs/dir_edit.c18
-rw-r--r--fs/afs/dir_silly.c11
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/afs/internal.h15
-rw-r--r--fs/afs/main.c4
-rw-r--r--fs/afs/misc.c1
-rw-r--r--fs/afs/protocol_yfs.h3
-rw-r--r--fs/afs/rotate.c17
-rw-r--r--fs/afs/server.c3
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/afs/yfsclient.c249
-rw-r--r--fs/aio.c2
-rw-r--r--fs/bcachefs/Kconfig121
-rw-r--r--fs/bcachefs/Makefile107
-rw-r--r--fs/bcachefs/acl.c445
-rw-r--r--fs/bcachefs/acl.h60
-rw-r--r--fs/bcachefs/alloc_background.c2680
-rw-r--r--fs/bcachefs/alloc_background.h361
-rw-r--r--fs/bcachefs/alloc_background_format.h95
-rw-r--r--fs/bcachefs/alloc_foreground.c1683
-rw-r--r--fs/bcachefs/alloc_foreground.h318
-rw-r--r--fs/bcachefs/alloc_types.h121
-rw-r--r--fs/bcachefs/async_objs.c132
-rw-r--r--fs/bcachefs/async_objs.h44
-rw-r--r--fs/bcachefs/async_objs_types.h25
-rw-r--r--fs/bcachefs/backpointers.c1391
-rw-r--r--fs/bcachefs/backpointers.h200
-rw-r--r--fs/bcachefs/bbpos.h37
-rw-r--r--fs/bcachefs/bbpos_types.h18
-rw-r--r--fs/bcachefs/bcachefs.h1295
-rw-r--r--fs/bcachefs/bcachefs_format.h1545
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h473
-rw-r--r--fs/bcachefs/bkey.c1112
-rw-r--r--fs/bcachefs/bkey.h605
-rw-r--r--fs/bcachefs/bkey_buf.h61
-rw-r--r--fs/bcachefs/bkey_cmp.h129
-rw-r--r--fs/bcachefs/bkey_methods.c497
-rw-r--r--fs/bcachefs/bkey_methods.h139
-rw-r--r--fs/bcachefs/bkey_sort.c214
-rw-r--r--fs/bcachefs/bkey_sort.h54
-rw-r--r--fs/bcachefs/bkey_types.h241
-rw-r--r--fs/bcachefs/bset.c1576
-rw-r--r--fs/bcachefs/bset.h536
-rw-r--r--fs/bcachefs/btree_cache.c1516
-rw-r--r--fs/bcachefs/btree_cache.h157
-rw-r--r--fs/bcachefs/btree_gc.c1308
-rw-r--r--fs/bcachefs/btree_gc.h88
-rw-r--r--fs/bcachefs/btree_gc_types.h34
-rw-r--r--fs/bcachefs/btree_io.c2742
-rw-r--r--fs/bcachefs/btree_io.h239
-rw-r--r--fs/bcachefs/btree_iter.c3804
-rw-r--r--fs/bcachefs/btree_iter.h1010
-rw-r--r--fs/bcachefs/btree_journal_iter.c830
-rw-r--r--fs/bcachefs/btree_journal_iter.h102
-rw-r--r--fs/bcachefs/btree_journal_iter_types.h37
-rw-r--r--fs/bcachefs/btree_key_cache.c880
-rw-r--r--fs/bcachefs/btree_key_cache.h59
-rw-r--r--fs/bcachefs/btree_key_cache_types.h34
-rw-r--r--fs/bcachefs/btree_locking.c936
-rw-r--r--fs/bcachefs/btree_locking.h466
-rw-r--r--fs/bcachefs/btree_node_scan.c611
-rw-r--r--fs/bcachefs/btree_node_scan.h11
-rw-r--r--fs/bcachefs/btree_node_scan_types.h31
-rw-r--r--fs/bcachefs/btree_trans_commit.c1121
-rw-r--r--fs/bcachefs/btree_types.h937
-rw-r--r--fs/bcachefs/btree_update.c916
-rw-r--r--fs/bcachefs/btree_update.h429
-rw-r--r--fs/bcachefs/btree_update_interior.c2854
-rw-r--r--fs/bcachefs/btree_update_interior.h364
-rw-r--r--fs/bcachefs/btree_write_buffer.c893
-rw-r--r--fs/bcachefs/btree_write_buffer.h113
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h59
-rw-r--r--fs/bcachefs/buckets.c1395
-rw-r--r--fs/bcachefs/buckets.h369
-rw-r--r--fs/bcachefs/buckets_types.h100
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.c174
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.h15
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal_types.h23
-rw-r--r--fs/bcachefs/chardev.c843
-rw-r--r--fs/bcachefs/chardev.h31
-rw-r--r--fs/bcachefs/checksum.c698
-rw-r--r--fs/bcachefs/checksum.h240
-rw-r--r--fs/bcachefs/clock.c181
-rw-r--r--fs/bcachefs/clock.h29
-rw-r--r--fs/bcachefs/clock_types.h38
-rw-r--r--fs/bcachefs/compress.c773
-rw-r--r--fs/bcachefs/compress.h73
-rw-r--r--fs/bcachefs/darray.c38
-rw-r--r--fs/bcachefs/darray.h158
-rw-r--r--fs/bcachefs/data_update.c1021
-rw-r--r--fs/bcachefs/data_update.h93
-rw-r--r--fs/bcachefs/debug.c996
-rw-r--r--fs/bcachefs/debug.h50
-rw-r--r--fs/bcachefs/dirent.c766
-rw-r--r--fs/bcachefs/dirent.h119
-rw-r--r--fs/bcachefs/dirent_format.h58
-rw-r--r--fs/bcachefs/disk_accounting.c1074
-rw-r--r--fs/bcachefs/disk_accounting.h301
-rw-r--r--fs/bcachefs/disk_accounting_format.h225
-rw-r--r--fs/bcachefs/disk_accounting_types.h19
-rw-r--r--fs/bcachefs/disk_groups.c591
-rw-r--r--fs/bcachefs/disk_groups.h111
-rw-r--r--fs/bcachefs/disk_groups_format.h21
-rw-r--r--fs/bcachefs/disk_groups_types.h18
-rw-r--r--fs/bcachefs/ec.c2405
-rw-r--r--fs/bcachefs/ec.h309
-rw-r--r--fs/bcachefs/ec_format.h43
-rw-r--r--fs/bcachefs/ec_types.h35
-rw-r--r--fs/bcachefs/enumerated_ref.c144
-rw-r--r--fs/bcachefs/enumerated_ref.h66
-rw-r--r--fs/bcachefs/enumerated_ref_types.h19
-rw-r--r--fs/bcachefs/errcode.c73
-rw-r--r--fs/bcachefs/errcode.h387
-rw-r--r--fs/bcachefs/error.c771
-rw-r--r--fs/bcachefs/error.h258
-rw-r--r--fs/bcachefs/extent_update.c155
-rw-r--r--fs/bcachefs/extent_update.h12
-rw-r--r--fs/bcachefs/extents.c1735
-rw-r--r--fs/bcachefs/extents.h768
-rw-r--r--fs/bcachefs/extents_format.h304
-rw-r--r--fs/bcachefs/extents_types.h42
-rw-r--r--fs/bcachefs/eytzinger.c315
-rw-r--r--fs/bcachefs/eytzinger.h300
-rw-r--r--fs/bcachefs/fast_list.c156
-rw-r--r--fs/bcachefs/fast_list.h41
-rw-r--r--fs/bcachefs/fifo.h127
-rw-r--r--fs/bcachefs/fs-io-buffered.c1109
-rw-r--r--fs/bcachefs/fs-io-buffered.h27
-rw-r--r--fs/bcachefs/fs-io-direct.c704
-rw-r--r--fs/bcachefs/fs-io-direct.h16
-rw-r--r--fs/bcachefs/fs-io-pagecache.c827
-rw-r--r--fs/bcachefs/fs-io-pagecache.h176
-rw-r--r--fs/bcachefs/fs-io.c1102
-rw-r--r--fs/bcachefs/fs-io.h184
-rw-r--r--fs/bcachefs/fs-ioctl.c442
-rw-r--r--fs/bcachefs/fs-ioctl.h8
-rw-r--r--fs/bcachefs/fs.c2768
-rw-r--r--fs/bcachefs/fs.h215
-rw-r--r--fs/bcachefs/fsck.c3363
-rw-r--r--fs/bcachefs/fsck.h34
-rw-r--r--fs/bcachefs/inode.c1566
-rw-r--r--fs/bcachefs/inode.h319
-rw-r--r--fs/bcachefs/inode_format.h185
-rw-r--r--fs/bcachefs/io_misc.c570
-rw-r--r--fs/bcachefs/io_misc.h36
-rw-r--r--fs/bcachefs/io_read.c1543
-rw-r--r--fs/bcachefs/io_read.h216
-rw-r--r--fs/bcachefs/io_write.c1780
-rw-r--r--fs/bcachefs/io_write.h77
-rw-r--r--fs/bcachefs/io_write_types.h129
-rw-r--r--fs/bcachefs/journal.c1832
-rw-r--r--fs/bcachefs/journal.h465
-rw-r--r--fs/bcachefs/journal_io.c2242
-rw-r--r--fs/bcachefs/journal_io.h94
-rw-r--r--fs/bcachefs/journal_reclaim.c1037
-rw-r--r--fs/bcachefs/journal_reclaim.h84
-rw-r--r--fs/bcachefs/journal_sb.c232
-rw-r--r--fs/bcachefs/journal_sb.h24
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c264
-rw-r--r--fs/bcachefs/journal_seq_blacklist.h23
-rw-r--r--fs/bcachefs/journal_seq_blacklist_format.h15
-rw-r--r--fs/bcachefs/journal_types.h342
-rw-r--r--fs/bcachefs/keylist.c50
-rw-r--r--fs/bcachefs/keylist.h72
-rw-r--r--fs/bcachefs/keylist_types.h16
-rw-r--r--fs/bcachefs/logged_ops.c119
-rw-r--r--fs/bcachefs/logged_ops.h20
-rw-r--r--fs/bcachefs/logged_ops_format.h35
-rw-r--r--fs/bcachefs/lru.c223
-rw-r--r--fs/bcachefs/lru.h70
-rw-r--r--fs/bcachefs/lru_format.h27
-rw-r--r--fs/bcachefs/mean_and_variance.c173
-rw-r--r--fs/bcachefs/mean_and_variance.h203
-rw-r--r--fs/bcachefs/mean_and_variance_test.c221
-rw-r--r--fs/bcachefs/migrate.c277
-rw-r--r--fs/bcachefs/migrate.h8
-rw-r--r--fs/bcachefs/move.c1494
-rw-r--r--fs/bcachefs/move.h165
-rw-r--r--fs/bcachefs/move_types.h46
-rw-r--r--fs/bcachefs/movinggc.c476
-rw-r--r--fs/bcachefs/movinggc.h20
-rw-r--r--fs/bcachefs/namei.c1034
-rw-r--r--fs/bcachefs/namei.h79
-rw-r--r--fs/bcachefs/nocow_locking.c142
-rw-r--r--fs/bcachefs/nocow_locking.h50
-rw-r--r--fs/bcachefs/nocow_locking_types.h20
-rw-r--r--fs/bcachefs/opts.c844
-rw-r--r--fs/bcachefs/opts.h693
-rw-r--r--fs/bcachefs/printbuf.c528
-rw-r--r--fs/bcachefs/printbuf.h298
-rw-r--r--fs/bcachefs/progress.c61
-rw-r--r--fs/bcachefs/progress.h29
-rw-r--r--fs/bcachefs/quota.c892
-rw-r--r--fs/bcachefs/quota.h73
-rw-r--r--fs/bcachefs/quota_format.h47
-rw-r--r--fs/bcachefs/quota_types.h43
-rw-r--r--fs/bcachefs/rcu_pending.c666
-rw-r--r--fs/bcachefs/rcu_pending.h27
-rw-r--r--fs/bcachefs/rebalance.c889
-rw-r--r--fs/bcachefs/rebalance.h59
-rw-r--r--fs/bcachefs/rebalance_format.h53
-rw-r--r--fs/bcachefs/rebalance_types.h41
-rw-r--r--fs/bcachefs/recovery.c1306
-rw-r--r--fs/bcachefs/recovery.h13
-rw-r--r--fs/bcachefs/recovery_passes.c646
-rw-r--r--fs/bcachefs/recovery_passes.h48
-rw-r--r--fs/bcachefs/recovery_passes_format.h106
-rw-r--r--fs/bcachefs/recovery_passes_types.h27
-rw-r--r--fs/bcachefs/reflink.c865
-rw-r--r--fs/bcachefs/reflink.h87
-rw-r--r--fs/bcachefs/reflink_format.h38
-rw-r--r--fs/bcachefs/replicas.c918
-rw-r--r--fs/bcachefs/replicas.h83
-rw-r--r--fs/bcachefs/replicas_format.h36
-rw-r--r--fs/bcachefs/replicas_types.h11
-rw-r--r--fs/bcachefs/sb-clean.c340
-rw-r--r--fs/bcachefs/sb-clean.h16
-rw-r--r--fs/bcachefs/sb-counters.c147
-rw-r--r--fs/bcachefs/sb-counters.h20
-rw-r--r--fs/bcachefs/sb-counters_format.h117
-rw-r--r--fs/bcachefs/sb-downgrade.c457
-rw-r--r--fs/bcachefs/sb-downgrade.h12
-rw-r--r--fs/bcachefs/sb-downgrade_format.h17
-rw-r--r--fs/bcachefs/sb-errors.c198
-rw-r--r--fs/bcachefs/sb-errors.h22
-rw-r--r--fs/bcachefs/sb-errors_format.h353
-rw-r--r--fs/bcachefs/sb-errors_types.h15
-rw-r--r--fs/bcachefs/sb-members.c606
-rw-r--r--fs/bcachefs/sb-members.h377
-rw-r--r--fs/bcachefs/sb-members_format.h128
-rw-r--r--fs/bcachefs/sb-members_types.h22
-rw-r--r--fs/bcachefs/seqmutex.h45
-rw-r--r--fs/bcachefs/siphash.c173
-rw-r--r--fs/bcachefs/siphash.h87
-rw-r--r--fs/bcachefs/six.c878
-rw-r--r--fs/bcachefs/six.h388
-rw-r--r--fs/bcachefs/snapshot.c2043
-rw-r--r--fs/bcachefs/snapshot.h275
-rw-r--r--fs/bcachefs/snapshot_format.h36
-rw-r--r--fs/bcachefs/snapshot_types.h57
-rw-r--r--fs/bcachefs/str_hash.c400
-rw-r--r--fs/bcachefs/str_hash.h431
-rw-r--r--fs/bcachefs/subvolume.c752
-rw-r--r--fs/bcachefs/subvolume.h88
-rw-r--r--fs/bcachefs/subvolume_format.h35
-rw-r--r--fs/bcachefs/subvolume_types.h11
-rw-r--r--fs/bcachefs/super-io.c1562
-rw-r--r--fs/bcachefs/super-io.h119
-rw-r--r--fs/bcachefs/super.c2547
-rw-r--r--fs/bcachefs/super.h55
-rw-r--r--fs/bcachefs/super_types.h35
-rw-r--r--fs/bcachefs/sysfs.c914
-rw-r--r--fs/bcachefs/sysfs.h49
-rw-r--r--fs/bcachefs/tests.c891
-rw-r--r--fs/bcachefs/tests.h15
-rw-r--r--fs/bcachefs/thread_with_file.c494
-rw-r--r--fs/bcachefs/thread_with_file.h81
-rw-r--r--fs/bcachefs/thread_with_file_types.h20
-rw-r--r--fs/bcachefs/time_stats.c191
-rw-r--r--fs/bcachefs/time_stats.h161
-rw-r--r--fs/bcachefs/trace.c18
-rw-r--r--fs/bcachefs/trace.h1883
-rw-r--r--fs/bcachefs/two_state_shared_lock.c8
-rw-r--r--fs/bcachefs/two_state_shared_lock.h58
-rw-r--r--fs/bcachefs/util.c1047
-rw-r--r--fs/bcachefs/util.h782
-rw-r--r--fs/bcachefs/varint.c130
-rw-r--r--fs/bcachefs/varint.h11
-rw-r--r--fs/bcachefs/vstructs.h63
-rw-r--r--fs/bcachefs/xattr.c642
-rw-r--r--fs/bcachefs/xattr.h50
-rw-r--r--fs/bcachefs/xattr_format.h25
-rw-r--r--fs/binfmt_elf.c48
-rw-r--r--fs/btrfs/Kconfig12
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/accessors.c2
-rw-r--r--fs/btrfs/backref.c26
-rw-r--r--fs/btrfs/backref.h4
-rw-r--r--fs/btrfs/bio.c54
-rw-r--r--fs/btrfs/bio.h2
-rw-r--r--fs/btrfs/block-group.c41
-rw-r--r--fs/btrfs/block-group.h2
-rw-r--r--fs/btrfs/btrfs_inode.h23
-rw-r--r--fs/btrfs/compression.c265
-rw-r--r--fs/btrfs/compression.h61
-rw-r--r--fs/btrfs/ctree.c135
-rw-r--r--fs/btrfs/defrag.c4
-rw-r--r--fs/btrfs/delayed-inode.c189
-rw-r--r--fs/btrfs/delayed-inode.h93
-rw-r--r--fs/btrfs/delayed-ref.c13
-rw-r--r--fs/btrfs/delayed-ref.h9
-rw-r--r--fs/btrfs/dev-replace.c12
-rw-r--r--fs/btrfs/direct-io.c12
-rw-r--r--fs/btrfs/disk-io.c99
-rw-r--r--fs/btrfs/disk-io.h3
-rw-r--r--fs/btrfs/export.c2
-rw-r--r--fs/btrfs/extent-io-tree.c4
-rw-r--r--fs/btrfs/extent-io-tree.h2
-rw-r--r--fs/btrfs/extent-tree.c104
-rw-r--r--fs/btrfs/extent-tree.h7
-rw-r--r--fs/btrfs/extent_io.c167
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/extent_map.c24
-rw-r--r--fs/btrfs/fiemap.c2
-rw-r--r--fs/btrfs/file-item.c60
-rw-r--r--fs/btrfs/file.c49
-rw-r--r--fs/btrfs/free-space-cache.c6
-rw-r--r--fs/btrfs/free-space-tree.c60
-rw-r--r--fs/btrfs/fs.c48
-rw-r--r--fs/btrfs/fs.h41
-rw-r--r--fs/btrfs/inode-item.c10
-rw-r--r--fs/btrfs/inode.c580
-rw-r--r--fs/btrfs/ioctl.c69
-rw-r--r--fs/btrfs/locking.c2
-rw-r--r--fs/btrfs/locking.h2
-rw-r--r--fs/btrfs/lzo.c93
-rw-r--r--fs/btrfs/messages.c1
-rw-r--r--fs/btrfs/messages.h1
-rw-r--r--fs/btrfs/misc.h49
-rw-r--r--fs/btrfs/print-tree.c256
-rw-r--r--fs/btrfs/qgroup.c50
-rw-r--r--fs/btrfs/raid-stripe-tree.c17
-rw-r--r--fs/btrfs/raid56.c121
-rw-r--r--fs/btrfs/raid56.h4
-rw-r--r--fs/btrfs/ref-verify.c12
-rw-r--r--fs/btrfs/ref-verify.h4
-rw-r--r--fs/btrfs/reflink.c15
-rw-r--r--fs/btrfs/relocation.c81
-rw-r--r--fs/btrfs/root-tree.c66
-rw-r--r--fs/btrfs/scrub.c95
-rw-r--r--fs/btrfs/scrub.h2
-rw-r--r--fs/btrfs/send.c373
-rw-r--r--fs/btrfs/space-info.c8
-rw-r--r--fs/btrfs/subpage.c2
-rw-r--r--fs/btrfs/subpage.h2
-rw-r--r--fs/btrfs/super.c66
-rw-r--r--fs/btrfs/sysfs.c16
-rw-r--r--fs/btrfs/tests/delayed-refs-tests.c4
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/transaction.c49
-rw-r--r--fs/btrfs/tree-checker.c43
-rw-r--r--fs/btrfs/tree-log.c1966
-rw-r--r--fs/btrfs/verity.c10
-rw-r--r--fs/btrfs/volumes.c75
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/btrfs/zlib.c86
-rw-r--r--fs/btrfs/zoned.c76
-rw-r--r--fs/btrfs/zoned.h9
-rw-r--r--fs/btrfs/zstd.c198
-rw-r--r--fs/cachefiles/namei.c3
-rw-r--r--fs/ceph/addr.c9
-rw-r--r--fs/ceph/crypto.c2
-rw-r--r--fs/ceph/debugfs.c14
-rw-r--r--fs/ceph/dir.c17
-rw-r--r--fs/ceph/file.c24
-rw-r--r--fs/ceph/inode.c89
-rw-r--r--fs/ceph/mds_client.c174
-rw-r--r--fs/ceph/mds_client.h18
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/configfs/mount.c2
-rw-r--r--fs/coredump.c6
-rw-r--r--fs/cramfs/inode.c11
-rw-r--r--fs/crypto/Kconfig5
-rw-r--r--fs/crypto/bio.c4
-rw-r--r--fs/crypto/crypto.c14
-rw-r--r--fs/crypto/fname.c12
-rw-r--r--fs/crypto/fscrypt_private.h30
-rw-r--r--fs/crypto/hkdf.c109
-rw-r--r--fs/crypto/hooks.c4
-rw-r--r--fs/crypto/inline_crypt.c12
-rw-r--r--fs/crypto/keyring.c30
-rw-r--r--fs/crypto/keysetup.c108
-rw-r--r--fs/crypto/policy.c11
-rw-r--r--fs/dcache.c5
-rw-r--r--fs/debugfs/inode.c11
-rw-r--r--fs/dlm/config.c64
-rw-r--r--fs/dlm/config.h2
-rw-r--r--fs/dlm/lock.c2
-rw-r--r--fs/dlm/lockspace.c46
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/dlm/main.c2
-rw-r--r--fs/dlm/member.c27
-rw-r--r--fs/dlm/recover.c2
-rw-r--r--fs/dlm/user.c6
-rw-r--r--fs/ecryptfs/inode.c3
-rw-r--r--fs/efivarfs/super.c2
-rw-r--r--fs/erofs/data.c4
-rw-r--r--fs/erofs/dir.c4
-rw-r--r--fs/erofs/erofs_fs.h8
-rw-r--r--fs/erofs/inode.c40
-rw-r--r--fs/erofs/internal.h6
-rw-r--r--fs/erofs/super.c20
-rw-r--r--fs/erofs/xattr.c13
-rw-r--r--fs/erofs/zdata.c7
-rw-r--r--fs/erofs/zmap.c71
-rw-r--r--fs/eventpoll.c139
-rw-r--r--fs/exec.c4
-rw-r--r--fs/ext4/crypto.c2
-rw-r--r--fs/ext4/ext4.h8
-rw-r--r--fs/ext4/ialloc.c4
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/super.c8
-rw-r--r--fs/ext4/verity.c2
-rw-r--r--fs/f2fs/f2fs.h6
-rw-r--r--fs/f2fs/super.c16
-rw-r--r--fs/f2fs/verity.c2
-rw-r--r--fs/fcntl.c10
-rw-r--r--fs/fhandle.c14
-rw-r--r--fs/file.c5
-rw-r--r--fs/fs-writeback.c141
-rw-r--r--fs/fsopen.c70
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/dir.c3
-rw-r--r--fs/fuse/file.c5
-rw-r--r--fs/fuse/fuse_i.h14
-rw-r--r--fs/fuse/inode.c20
-rw-r--r--fs/fuse/passthrough.c5
-rw-r--r--fs/fuse/virtio_fs.c2
-rw-r--r--fs/gfs2/file.c23
-rw-r--r--fs/gfs2/glock.c185
-rw-r--r--fs/gfs2/glock.h4
-rw-r--r--fs/gfs2/incore.h5
-rw-r--r--fs/gfs2/lock_dlm.c104
-rw-r--r--fs/gfs2/main.c5
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/gfs2/trace_gfs2.h1
-rw-r--r--fs/gfs2/util.c38
-rw-r--r--fs/gfs2/util.h36
-rw-r--r--fs/hfs/bfind.c12
-rw-r--r--fs/hfs/bitmap.c4
-rw-r--r--fs/hfs/bnode.c28
-rw-r--r--fs/hfs/brec.c35
-rw-r--r--fs/hfs/btree.c2
-rw-r--r--fs/hfs/catalog.c129
-rw-r--r--fs/hfs/extent.c19
-rw-r--r--fs/hfs/hfs_fs.h39
-rw-r--r--fs/hfs/inode.c25
-rw-r--r--fs/hfs/mdb.c20
-rw-r--r--fs/hfs/super.c4
-rw-r--r--fs/hfsplus/attributes.c8
-rw-r--r--fs/hfsplus/bfind.c12
-rw-r--r--fs/hfsplus/bitmap.c10
-rw-r--r--fs/hfsplus/bnode.c69
-rw-r--r--fs/hfsplus/brec.c10
-rw-r--r--fs/hfsplus/btree.c10
-rw-r--r--fs/hfsplus/catalog.c6
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hfsplus/extents.c27
-rw-r--r--fs/hfsplus/hfsplus_fs.h85
-rw-r--r--fs/hfsplus/super.c41
-rw-r--r--fs/hfsplus/unicode.c48
-rw-r--r--fs/hfsplus/xattr.c10
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/hugetlbfs/inode.c10
-rw-r--r--fs/init.c17
-rw-r--r--fs/inode.c118
-rw-r--r--fs/internal.h1
-rw-r--r--fs/ioctl.c5
-rw-r--r--fs/iomap/buffered-io.c18
-rw-r--r--fs/iomap/direct-io.c3
-rw-r--r--fs/iomap/trace.h1
-rw-r--r--fs/kernfs/file.c58
-rw-r--r--fs/kernfs/mount.c2
-rw-r--r--fs/locks.c4
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/mount.h12
-rw-r--r--fs/namei.c186
-rw-r--r--fs/namespace.c318
-rw-r--r--fs/netfs/buffered_read.c10
-rw-r--r--fs/netfs/buffered_write.c2
-rw-r--r--fs/netfs/direct_read.c7
-rw-r--r--fs/netfs/direct_write.c6
-rw-r--r--fs/netfs/internal.h1
-rw-r--r--fs/netfs/misc.c2
-rw-r--r--fs/netfs/objects.c32
-rw-r--r--fs/netfs/read_pgpriv2.c2
-rw-r--r--fs/netfs/read_single.c2
-rw-r--r--fs/netfs/write_issue.c3
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/file.c40
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c21
-rw-r--r--fs/nfs/inode.c19
-rw-r--r--fs/nfs/internal.h12
-rw-r--r--fs/nfs/io.c13
-rw-r--r--fs/nfs/localio.c21
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs42proc.c35
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c7
-rw-r--r--fs/nfs/nfs4renewd.c2
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/write.c53
-rw-r--r--fs/nfsd/filecache.c2
-rw-r--r--fs/nfsd/vfs.c3
-rw-r--r--fs/nilfs2/sysfs.c4
-rw-r--r--fs/nilfs2/sysfs.h8
-rw-r--r--fs/notify/fsnotify.c2
-rw-r--r--fs/notify/mark.c4
-rw-r--r--fs/nsfs.c211
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c3
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c5
-rw-r--r--fs/ocfs2/extent_map.c10
-rw-r--r--fs/ocfs2/inode.c3
-rw-r--r--fs/ocfs2/refcounttree.c4
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/orangefs/super.c2
-rw-r--r--fs/overlayfs/overlayfs.h3
-rw-r--r--fs/overlayfs/readdir.c28
-rw-r--r--fs/overlayfs/super.c2
-rw-r--r--fs/pidfs.c4
-rw-r--r--fs/pipe.c6
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/generic.c39
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/namespaces.c6
-rw-r--r--fs/proc/root.c98
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/pstore/inode.c2
-rw-r--r--fs/pstore/zone.c21
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/ramfs/inode.c2
-rw-r--r--fs/read_write.c14
-rw-r--r--fs/resctrl/ctrlmondata.c28
-rw-r--r--fs/resctrl/internal.h62
-rw-r--r--fs/resctrl/monitor.c1014
-rw-r--r--fs/resctrl/rdtgroup.c259
-rw-r--r--fs/smb/client/cifs_debug.c112
-rw-r--r--fs/smb/client/cifs_unicode.c3
-rw-r--r--fs/smb/client/cifsfs.c18
-rw-r--r--fs/smb/client/cifsglob.h22
-rw-r--r--fs/smb/client/cifsproto.h4
-rw-r--r--fs/smb/client/file.c34
-rw-r--r--fs/smb/client/inode.c107
-rw-r--r--fs/smb/client/misc.c38
-rw-r--r--fs/smb/client/reparse.c2
-rw-r--r--fs/smb/client/smb1ops.c4
-rw-r--r--fs/smb/client/smb2glob.h3
-rw-r--r--fs/smb/client/smb2inode.c289
-rw-r--r--fs/smb/client/smb2misc.c19
-rw-r--r--fs/smb/client/smb2ops.c40
-rw-r--r--fs/smb/client/smb2pdu.c6
-rw-r--r--fs/smb/client/smb2proto.h3
-rw-r--r--fs/smb/client/smbdirect.c1222
-rw-r--r--fs/smb/client/smbdirect.h102
-rw-r--r--fs/smb/client/trace.h61
-rw-r--r--fs/smb/common/smbdirect/smbdirect.h7
-rw-r--r--fs/smb/common/smbdirect/smbdirect_socket.h319
-rw-r--r--fs/smb/server/connection.c4
-rw-r--r--fs/smb/server/connection.h10
-rw-r--r--fs/smb/server/ksmbd_work.c2
-rw-r--r--fs/smb/server/server.c1
-rw-r--r--fs/smb/server/smb2pdu.c48
-rw-r--r--fs/smb/server/smb2pdu.h6
-rw-r--r--fs/smb/server/transport_rdma.c1781
-rw-r--r--fs/smb/server/transport_rdma.h45
-rw-r--r--fs/smb/server/vfs.c11
-rw-r--r--fs/smb/server/vfs_cache.h2
-rw-r--r--fs/super.c74
-rw-r--r--fs/ubifs/crypto.c2
-rw-r--r--fs/ubifs/super.c4
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/verity/enable.c18
-rw-r--r--fs/verity/fsverity_private.h11
-rw-r--r--fs/verity/hash_algs.c3
-rw-r--r--fs/verity/open.c23
-rw-r--r--fs/verity/verify.c177
-rw-r--r--fs/xfs/Kconfig22
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c7
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c5
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c25
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c31
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c2
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h114
-rw-r--r--fs/xfs/libxfs/xfs_exchmaps.c4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c6
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c3
-rw-r--r--fs/xfs/libxfs/xfs_inode_util.c11
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h150
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/libxfs/xfs_metafile.c2
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c7
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.c9
-rw-r--r--fs/xfs/libxfs/xfs_zones.h7
-rw-r--r--fs/xfs/scrub/cow_repair.c4
-rw-r--r--fs/xfs/scrub/metapath.c12
-rw-r--r--fs/xfs/scrub/newbt.c9
-rw-r--r--fs/xfs/scrub/reap.c620
-rw-r--r--fs/xfs/scrub/repair.c2
-rw-r--r--fs/xfs/scrub/repair.h8
-rw-r--r--fs/xfs/scrub/symlink_repair.c2
-rw-r--r--fs/xfs/scrub/trace.c1
-rw-r--r--fs/xfs/scrub/trace.h45
-rw-r--r--fs/xfs/xfs_attr_item.c2
-rw-r--r--fs/xfs/xfs_buf.c46
-rw-r--r--fs/xfs/xfs_buf.h4
-rw-r--r--fs/xfs/xfs_buf_item_recover.c10
-rw-r--r--fs/xfs/xfs_error.c216
-rw-r--r--fs/xfs/xfs_error.h47
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_extfree_item.h4
-rw-r--r--fs/xfs/xfs_file.c75
-rw-r--r--fs/xfs/xfs_globals.c2
-rw-r--r--fs/xfs/xfs_icache.c6
-rw-r--r--fs/xfs/xfs_inode.c119
-rw-r--r--fs/xfs/xfs_inode_item.c125
-rw-r--r--fs/xfs/xfs_inode_item.h10
-rw-r--r--fs/xfs/xfs_ioctl.c24
-rw-r--r--fs/xfs/xfs_iomap.c19
-rw-r--r--fs/xfs/xfs_iops.c14
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c38
-rw-r--r--fs/xfs/xfs_log.h37
-rw-r--r--fs/xfs/xfs_log_priv.h4
-rw-r--r--fs/xfs/xfs_log_recover.c34
-rw-r--r--fs/xfs/xfs_mount.c13
-rw-r--r--fs/xfs/xfs_mount.h12
-rw-r--r--fs/xfs/xfs_mru_cache.c3
-rw-r--r--fs/xfs/xfs_notify_failure.c2
-rw-r--r--fs/xfs/xfs_super.c84
-rw-r--r--fs/xfs/xfs_sysctl.c29
-rw-r--r--fs/xfs/xfs_sysctl.h3
-rw-r--r--fs/xfs/xfs_trace.h2
-rw-r--r--fs/xfs/xfs_trans.c23
-rw-r--r--fs/xfs/xfs_trans_ail.c2
-rw-r--r--fs/xfs/xfs_zone_alloc.c120
640 files changed, 12117 insertions, 124013 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 795c6388744c..1581ebac5bb4 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
/*
* in case of non cached mode always drop the
* inode because we want the inode attribute
diff --git a/fs/Kconfig b/fs/Kconfig
index c654a3642897..7815379032da 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -51,7 +51,6 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
-source "fs/bcachefs/Kconfig"
source "fs/zonefs/Kconfig"
endif # BLOCK
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index bd2f530e5740..1949e25c7741 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -184,4 +184,13 @@ config EXEC_KUNIT_TEST
This builds the exec KUnit tests, which tests boundary conditions
of various aspects of the exec internals.
+config ARCH_HAS_ELF_CORE_EFLAGS
+ bool
+ depends on BINFMT_ELF && ELF_CORE
+ default n
+ help
+ Select this option if the architecture makes use of the e_flags
+ field in the ELF header to store ABI or other architecture-specific
+ information that should be preserved in core dumps.
+
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 334654f9584b..e3523ab2e587 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -121,7 +121,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
-obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 69e1dd55b160..894d2bad6b6c 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -42,7 +42,7 @@ static void afs_volume_init_callback(struct afs_volume *volume)
list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb);
- queue_work(system_unbound_wq, &vnode->cb_work);
+ queue_work(system_dfl_wq, &vnode->cb_work);
}
}
@@ -90,7 +90,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
if (reason != afs_cb_break_for_deleted &&
vnode->status.type == AFS_FTYPE_FILE &&
atomic_read(&vnode->cb_nr_mmap))
- queue_work(system_unbound_wq, &vnode->cb_work);
+ queue_work(system_dfl_wq, &vnode->cb_work);
trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true);
} else {
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index bfb69e066672..89d36e3e5c79 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1823,7 +1823,8 @@ error:
static void afs_rename_success(struct afs_operation *op)
{
- struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry));
+ struct afs_vnode *vnode = op->more_files[0].vnode;
+ struct afs_vnode *new_vnode = op->more_files[1].vnode;
_enter("op=%08x", op->debug_id);
@@ -1834,22 +1835,40 @@ static void afs_rename_success(struct afs_operation *op)
op->ctime = op->file[1].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[1]);
}
+ if (op->more_files[0].scb.have_status)
+ afs_vnode_commit_status(op, &op->more_files[0]);
+ if (op->more_files[1].scb.have_status)
+ afs_vnode_commit_status(op, &op->more_files[1]);
/* If we're moving a subdir between dirs, we need to update
* its DV counter too as the ".." will be altered.
*/
- if (S_ISDIR(vnode->netfs.inode.i_mode) &&
- op->file[0].vnode != op->file[1].vnode) {
- u64 new_dv;
+ if (op->file[0].vnode != op->file[1].vnode) {
+ if (S_ISDIR(vnode->netfs.inode.i_mode)) {
+ u64 new_dv;
- write_seqlock(&vnode->cb_lock);
+ write_seqlock(&vnode->cb_lock);
- new_dv = vnode->status.data_version + 1;
- trace_afs_set_dv(vnode, new_dv);
- vnode->status.data_version = new_dv;
- inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
+ new_dv = vnode->status.data_version + 1;
+ trace_afs_set_dv(vnode, new_dv);
+ vnode->status.data_version = new_dv;
+ inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
- write_sequnlock(&vnode->cb_lock);
+ write_sequnlock(&vnode->cb_lock);
+ }
+
+ if ((op->rename.rename_flags & RENAME_EXCHANGE) &&
+ S_ISDIR(new_vnode->netfs.inode.i_mode)) {
+ u64 new_dv;
+
+ write_seqlock(&new_vnode->cb_lock);
+
+ new_dv = new_vnode->status.data_version + 1;
+ new_vnode->status.data_version = new_dv;
+ inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv);
+
+ write_sequnlock(&new_vnode->cb_lock);
+ }
}
}
@@ -1900,8 +1919,8 @@ static void afs_rename_edit_dir(struct afs_operation *op)
if (S_ISDIR(vnode->netfs.inode.i_mode) &&
new_dvnode != orig_dvnode &&
test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
- afs_edit_dir_update_dotdot(vnode, new_dvnode,
- afs_edit_dir_for_rename_sub);
+ afs_edit_dir_update(vnode, &dotdot_name, new_dvnode,
+ afs_edit_dir_for_rename_sub);
new_inode = d_inode(new_dentry);
if (new_inode) {
@@ -1915,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op)
/* Now we can update d_fsdata on the dentries to reflect their
* new parent's data_version.
- *
- * Note that if we ever implement RENAME_EXCHANGE, we'll have
- * to update both dentries with opposing dir versions.
*/
afs_update_dentry_version(op, new_dvp, op->dentry);
afs_update_dentry_version(op, new_dvp, op->dentry_2);
@@ -1930,6 +1946,67 @@ static void afs_rename_edit_dir(struct afs_operation *op)
fscache_end_operation(&new_cres);
}
+static void afs_rename_exchange_edit_dir(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode *orig_dvnode = orig_dvp->vnode;
+ struct afs_vnode *new_dvnode = new_dvp->vnode;
+ struct afs_vnode *old_vnode = op->more_files[0].vnode;
+ struct afs_vnode *new_vnode = op->more_files[1].vnode;
+ struct dentry *old_dentry = op->dentry;
+ struct dentry *new_dentry = op->dentry_2;
+
+ _enter("op=%08x", op->debug_id);
+
+ if (new_dvnode == orig_dvnode) {
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) {
+ afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+ new_vnode, afs_edit_dir_for_rename_0);
+ afs_edit_dir_update(orig_dvnode, &new_dentry->d_name,
+ old_vnode, afs_edit_dir_for_rename_1);
+ }
+
+ d_exchange(old_dentry, new_dentry);
+ up_write(&orig_dvnode->validate_lock);
+ } else {
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta)
+ afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+ new_vnode, afs_edit_dir_for_rename_0);
+
+ up_write(&orig_dvnode->validate_lock);
+ down_write(&new_dvnode->validate_lock);
+
+ if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) &&
+ new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta)
+ afs_edit_dir_update(new_dvnode, &new_dentry->d_name,
+ old_vnode, afs_edit_dir_for_rename_1);
+
+ if (S_ISDIR(old_vnode->netfs.inode.i_mode) &&
+ test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags))
+ afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode,
+ afs_edit_dir_for_rename_sub);
+
+ if (S_ISDIR(new_vnode->netfs.inode.i_mode) &&
+ test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags))
+ afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode,
+ afs_edit_dir_for_rename_sub);
+
+ /* Now we can update d_fsdata on the dentries to reflect their
+ * new parents' data_version.
+ */
+ afs_update_dentry_version(op, new_dvp, old_dentry);
+ afs_update_dentry_version(op, orig_dvp, new_dentry);
+
+ d_exchange(old_dentry, new_dentry);
+ up_write(&new_dvnode->validate_lock);
+ }
+}
+
static void afs_rename_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
@@ -1948,6 +2025,32 @@ static const struct afs_operation_ops afs_rename_operation = {
.put = afs_rename_put,
};
+#if 0 /* Autoswitched in yfs_fs_rename_replace(). */
+static const struct afs_operation_ops afs_rename_replace_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_replace,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_edit_dir,
+ .put = afs_rename_put,
+};
+#endif
+
+static const struct afs_operation_ops afs_rename_noreplace_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_noreplace,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_edit_dir,
+ .put = afs_rename_put,
+};
+
+static const struct afs_operation_ops afs_rename_exchange_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_exchange,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_exchange_edit_dir,
+ .put = afs_rename_put,
+};
+
/*
* rename a file in an AFS filesystem and/or move it between directories
*/
@@ -1956,10 +2059,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *new_dentry, unsigned int flags)
{
struct afs_operation *op;
- struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
+ struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL;
int ret;
- if (flags)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
return -EINVAL;
/* Don't allow silly-rename files be moved around. */
@@ -1969,6 +2072,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
vnode = AFS_FS_I(d_inode(old_dentry));
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
+ if (d_is_positive(new_dentry))
+ new_vnode = AFS_FS_I(d_inode(new_dentry));
_enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}",
orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
@@ -1989,6 +2094,11 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (ret < 0)
goto error;
+ ret = -ENOMEM;
+ op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+ if (!op->more_files)
+ goto error;
+
afs_op_set_vnode(op, 0, orig_dvnode);
afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
op->file[0].dv_delta = 1;
@@ -1997,46 +2107,63 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
+ op->more_files[0].vnode = vnode;
+ op->more_files[0].speculative = true;
+ op->more_files[1].vnode = new_vnode;
+ op->more_files[1].speculative = true;
+ op->nr_files = 4;
op->dentry = old_dentry;
op->dentry_2 = new_dentry;
+ op->rename.rename_flags = flags;
op->rename.new_negative = d_is_negative(new_dentry);
- op->ops = &afs_rename_operation;
- /* For non-directories, check whether the target is busy and if so,
- * make a copy of the dentry and then do a silly-rename. If the
- * silly-rename succeeds, the copied dentry is hashed and becomes the
- * new target.
- */
- if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
- /* To prevent any new references to the target during the
- * rename, we unhash the dentry in advance.
+ if (flags & RENAME_NOREPLACE) {
+ op->ops = &afs_rename_noreplace_operation;
+ } else if (flags & RENAME_EXCHANGE) {
+ op->ops = &afs_rename_exchange_operation;
+ d_drop(new_dentry);
+ } else {
+ /* If we might displace the target, we might need to do silly
+ * rename.
*/
- if (!d_unhashed(new_dentry)) {
- d_drop(new_dentry);
- op->rename.rehash = new_dentry;
- }
+ op->ops = &afs_rename_operation;
- if (d_count(new_dentry) > 2) {
- /* copy the target dentry's name */
- op->rename.tmp = d_alloc(new_dentry->d_parent,
- &new_dentry->d_name);
- if (!op->rename.tmp) {
- afs_op_nomem(op);
- goto error;
+ /* For non-directories, check whether the target is busy and if
+ * so, make a copy of the dentry and then do a silly-rename.
+ * If the silly-rename succeeds, the copied dentry is hashed
+ * and becomes the new target.
+ */
+ if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
+ /* To prevent any new references to the target during
+ * the rename, we unhash the dentry in advance.
+ */
+ if (!d_unhashed(new_dentry)) {
+ d_drop(new_dentry);
+ op->rename.rehash = new_dentry;
}
- ret = afs_sillyrename(new_dvnode,
- AFS_FS_I(d_inode(new_dentry)),
- new_dentry, op->key);
- if (ret) {
- afs_op_set_error(op, ret);
- goto error;
+ if (d_count(new_dentry) > 2) {
+ /* copy the target dentry's name */
+ op->rename.tmp = d_alloc(new_dentry->d_parent,
+ &new_dentry->d_name);
+ if (!op->rename.tmp) {
+ afs_op_nomem(op);
+ goto error;
+ }
+
+ ret = afs_sillyrename(new_dvnode,
+ AFS_FS_I(d_inode(new_dentry)),
+ new_dentry, op->key);
+ if (ret) {
+ afs_op_set_error(op, ret);
+ goto error;
+ }
+
+ op->dentry_2 = op->rename.tmp;
+ op->rename.rehash = NULL;
+ op->rename.new_negative = true;
}
-
- op->dentry_2 = op->rename.tmp;
- op->rename.rehash = NULL;
- op->rename.new_negative = true;
}
}
@@ -2052,6 +2179,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
d_drop(old_dentry);
ret = afs_do_sync_operation(op);
+ if (ret == -ENOTSUPP)
+ ret = -EINVAL;
out:
afs_dir_unuse_cookie(orig_dvnode, ret);
if (new_dvnode != orig_dvnode)
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 60a549f1d9c5..4b1342c72089 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -522,11 +522,11 @@ error:
}
/*
- * Edit a subdirectory that has been moved between directories to update the
- * ".." entry.
+ * Edit an entry in a directory to update the vnode it refers to. This is also
+ * used to update the ".." entry in a directory.
*/
-void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
- enum afs_edit_dir_reason why)
+void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
+ struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *block;
union afs_xdr_dirent *de;
@@ -557,7 +557,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto already_invalidated;
- slot = afs_dir_scan_block(block, &dotdot_name, b);
+ slot = afs_dir_scan_block(block, name, b);
if (slot >= 0)
goto found_dirent;
@@ -566,7 +566,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
/* Didn't find the dirent to clobber. Download the directory again. */
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd);
goto out;
@@ -576,7 +576,7 @@ found_dirent:
de->u.unique = htonl(new_dvnode->fid.unique);
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot,
- ntohl(de->u.vnode), ntohl(de->u.unique), "..");
+ ntohl(de->u.vnode), ntohl(de->u.unique), name->name);
kunmap_local(block);
netfs_single_mark_inode_dirty(&vnode->netfs.inode);
@@ -589,12 +589,12 @@ out:
already_invalidated:
kunmap_local(block);
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
goto out;
error:
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
goto out;
}
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 0b80eb93fa40..014495d4b868 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
if (IS_ERR(op))
return PTR_ERR(op);
+ op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+ if (!op->more_files) {
+ afs_put_operation(op);
+ return -ENOMEM;
+ }
+
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, dvnode);
op->file[0].dv_delta = 1;
@@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
+ op->more_files[0].vnode = AFS_FS_I(d_inode(old));
+ op->more_files[0].speculative = true;
+ op->more_files[1].vnode = AFS_FS_I(d_inode(new));
+ op->more_files[1].speculative = true;
+ op->nr_files = 4;
op->dentry = old;
op->dentry_2 = new;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e9538e91f848..e1cb17b85791 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode)
_enter("");
if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
- return generic_delete_inode(inode);
+ return inode_just_drop(inode);
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
/*
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 1124ea4000cb..444a3ea4fdf6 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -562,6 +562,7 @@ struct afs_server {
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */
#define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */
+#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */
refcount_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
@@ -891,9 +892,10 @@ struct afs_operation {
bool need_rehash;
} unlink;
struct {
- struct dentry *rehash;
- struct dentry *tmp;
- bool new_negative;
+ struct dentry *rehash;
+ struct dentry *tmp;
+ unsigned int rename_flags;
+ bool new_negative;
} rename;
struct {
struct netfs_io_subrequest *subreq;
@@ -1100,8 +1102,8 @@ int afs_single_writepages(struct address_space *mapping,
extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *,
enum afs_edit_dir_reason);
extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
-void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
- enum afs_edit_dir_reason why);
+void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
+ struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why);
void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode);
/*
@@ -1693,6 +1695,9 @@ extern void yfs_fs_remove_dir(struct afs_operation *);
extern void yfs_fs_link(struct afs_operation *);
extern void yfs_fs_symlink(struct afs_operation *);
extern void yfs_fs_rename(struct afs_operation *);
+void yfs_fs_rename_replace(struct afs_operation *op);
+void yfs_fs_rename_noreplace(struct afs_operation *op);
+void yfs_fs_rename_exchange(struct afs_operation *op);
extern void yfs_fs_store_data(struct afs_operation *);
extern void yfs_fs_setattr(struct afs_operation *);
extern void yfs_fs_get_volume_status(struct afs_operation *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 02475d415d88..e6bb8237db98 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -169,13 +169,13 @@ static int __init afs_init(void)
printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
- afs_wq = alloc_workqueue("afs", 0, 0);
+ afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0);
if (!afs_wq)
goto error_afs_wq;
afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
if (!afs_async_calls)
goto error_async;
- afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0);
+ afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!afs_lock_manager)
goto error_lockmgr;
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 8f2b3a177690..c8a7f266080d 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -131,6 +131,7 @@ int afs_abort_to_error(u32 abort_code)
case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG;
case RXGEN_OPCODE: return -ENOTSUPP;
+ case RX_INVALID_OPERATION: return -ENOTSUPP;
default: return -EREMOTEIO;
}
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
index e4cd89c44c46..b2f06c1917c2 100644
--- a/fs/afs/protocol_yfs.h
+++ b/fs/afs/protocol_yfs.h
@@ -50,6 +50,9 @@ enum YFS_FS_Operations {
YFSREMOVEACL = 64171,
YFSREMOVEFILE2 = 64173,
YFSSTOREOPAQUEACL2 = 64174,
+ YFSRENAME_REPLACE = 64176,
+ YFSRENAME_NOREPLACE = 64177,
+ YFSRENAME_EXCHANGE = 64187,
YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */
YFSFETCHDATA64 = 64537, /* YFS Fetch file data */
YFSSTOREDATA64 = 64538, /* YFS Store file data */
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index a1c24f589d9e..6a4e7da10fc4 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -432,6 +432,16 @@ bool afs_select_fileserver(struct afs_operation *op)
afs_op_set_error(op, -EDQUOT);
goto failed_but_online;
+ case RX_INVALID_OPERATION:
+ case RXGEN_OPCODE:
+ /* Handle downgrading to an older operation. */
+ afs_op_set_error(op, -ENOTSUPP);
+ if (op->flags & AFS_OPERATION_DOWNGRADE) {
+ op->flags &= ~AFS_OPERATION_DOWNGRADE;
+ goto go_again;
+ }
+ goto failed_but_online;
+
default:
afs_op_accumulate_error(op, error, abort_code);
failed_but_online:
@@ -620,12 +630,13 @@ iterate_address:
op->addr_index = addr_index;
set_bit(addr_index, &op->addr_tried);
- op->volsync.creation = TIME64_MIN;
- op->volsync.update = TIME64_MIN;
- op->call_responded = false;
_debug("address [%u] %u/%u %pISp",
op->server_index, addr_index, alist->nr_addrs,
rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer));
+go_again:
+ op->volsync.creation = TIME64_MIN;
+ op->volsync.update = TIME64_MIN;
+ op->call_responded = false;
_leave(" = t");
return true;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index a97562f831eb..c4428ebddb1d 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -331,13 +331,14 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate,
void afs_put_server(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int a, debug_id = server->debug_id;
+ unsigned int a, debug_id;
bool zero;
int r;
if (!server)
return;
+ debug_id = server->debug_id;
a = atomic_read(&server->active);
zero = __refcount_dec_and_test(&server->ref, &r);
trace_afs_server(debug_id, r - 1, a, reason);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 2e7526ea883a..93ad86ff3345 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -172,7 +172,7 @@ static void afs_issue_write_worker(struct work_struct *work)
void afs_issue_write(struct netfs_io_subrequest *subreq)
{
subreq->work.func = afs_issue_write_worker;
- if (!queue_work(system_unbound_wq, &subreq->work))
+ if (!queue_work(system_dfl_wq, &subreq->work))
WARN_ON_ONCE(1);
}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 257af259c04a..febf13a49f0b 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -1042,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op)
_enter("");
+ if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags))
+ return yfs_fs_rename_replace(op);
+
call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename,
sizeof(__be32) +
sizeof(struct yfs_xdr_RPCFlags) +
@@ -1071,6 +1074,252 @@ void yfs_fs_rename(struct afs_operation *op)
}
/*
+ * Deliver reply data to a YFS.Rename_NoReplace operation. This does not
+ * return the status of a displaced target inode as there cannot be one.
+ */
+static int yfs_deliver_fs_rename_1(struct afs_call *call)
+{
+ struct afs_operation *op = call->op;
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode_param *old_vp = &op->more_files[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
+ xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb);
+ xdr_decode_YFSFid(&bp, &old_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb);
+ xdr_decode_YFSVolSync(&bp, &op->volsync);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange
+ * operation. These return the status of the displaced target inode if there
+ * was one.
+ */
+static int yfs_deliver_fs_rename_2(struct afs_call *call)
+{
+ struct afs_operation *op = call->op;
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode_param *old_vp = &op->more_files[0];
+ struct afs_vnode_param *new_vp = &op->more_files[1];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
+ xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb);
+ xdr_decode_YFSFid(&bp, &old_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb);
+ xdr_decode_YFSFid(&bp, &new_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb);
+ xdr_decode_YFSVolSync(&bp, &op->volsync);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+static void yfs_done_fs_rename_replace(struct afs_call *call)
+{
+ if (call->error == -ECONNABORTED &&
+ (call->abort_code == RX_INVALID_OPERATION ||
+ call->abort_code == RXGEN_OPCODE)) {
+ set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags);
+ call->op->flags |= AFS_OPERATION_DOWNGRADE;
+ }
+}
+
+/*
+ * YFS.Rename_Replace operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_Replace = {
+ .name = "FS.Rename_Replace",
+ .op = yfs_FS_Rename_Replace,
+ .deliver = yfs_deliver_fs_rename_2,
+ .done = yfs_done_fs_rename_replace,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.Rename_NoReplace operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_NoReplace = {
+ .name = "FS.Rename_NoReplace",
+ .op = yfs_FS_Rename_NoReplace,
+ .deliver = yfs_deliver_fs_rename_1,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.Rename_Exchange operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_Exchange = {
+ .name = "FS.Rename_Exchange",
+ .op = yfs_FS_Rename_Exchange,
+ .deliver = yfs_deliver_fs_rename_2,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Rename a file or directory, replacing the target if it exists. The status
+ * of a displaced target is returned.
+ */
+void yfs_fs_rename_replace(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_REPLACE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Rename a file or directory, failing if the target dirent exists.
+ */
+void yfs_fs_rename_noreplace(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Exchange a pair of files directories.
+ */
+void yfs_fs_rename_exchange(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
* YFS.StoreData64 operation type.
*/
static const struct afs_call_type yfs_RXYFSStoreData64 = {
diff --git a/fs/aio.c b/fs/aio.c
index 7fc7b6221312..6002617f078c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -636,7 +636,7 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
/* Synchronize against RCU protected table->table[] dereferences */
INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
- queue_rcu_work(system_wq, &ctx->free_rwork);
+ queue_rcu_work(system_percpu_wq, &ctx->free_rwork);
}
/*
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
deleted file mode 100644
index 8cb2b9d5da96..000000000000
--- a/fs/bcachefs/Kconfig
+++ /dev/null
@@ -1,121 +0,0 @@
-
-config BCACHEFS_FS
- tristate "bcachefs filesystem support (EXPERIMENTAL)"
- depends on BLOCK
- select EXPORTFS
- select CLOSURES
- select CRC32
- select CRC64
- select FS_POSIX_ACL
- select LZ4_COMPRESS
- select LZ4_DECOMPRESS
- select LZ4HC_COMPRESS
- select LZ4HC_DECOMPRESS
- select ZLIB_DEFLATE
- select ZLIB_INFLATE
- select ZSTD_COMPRESS
- select ZSTD_DECOMPRESS
- select CRYPTO_LIB_SHA256
- select CRYPTO_LIB_CHACHA
- select CRYPTO_LIB_POLY1305
- select KEYS
- select RAID6_PQ
- select XOR_BLOCKS
- select XXHASH
- select SRCU
- select SYMBOLIC_ERRNAME
- select MIN_HEAP
- select XARRAY_MULTI
- help
- The bcachefs filesystem - a modern, copy on write filesystem, with
- support for multiple devices, compression, checksumming, etc.
-
-config BCACHEFS_QUOTA
- bool "bcachefs quota support"
- depends on BCACHEFS_FS
- select QUOTACTL
-
-config BCACHEFS_ERASURE_CODING
- bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
- depends on BCACHEFS_FS
- select QUOTACTL
- help
- This enables the "erasure_code" filesysystem and inode option, which
- organizes data into reed-solomon stripes instead of ordinary
- replication.
-
- WARNING: this feature is still undergoing on disk format changes, and
- should only be enabled for testing purposes.
-
-config BCACHEFS_POSIX_ACL
- bool "bcachefs POSIX ACL support"
- depends on BCACHEFS_FS
- select FS_POSIX_ACL
-
-config BCACHEFS_DEBUG
- bool "bcachefs debugging"
- depends on BCACHEFS_FS
- help
- Enables many extra debugging checks and assertions.
-
- The resulting code will be significantly slower than normal; you
- probably shouldn't select this option unless you're a developer.
-
-config BCACHEFS_INJECT_TRANSACTION_RESTARTS
- bool "Randomly inject transaction restarts"
- depends on BCACHEFS_DEBUG
- help
- Randomly inject transaction restarts in a few core paths - may have a
- significant performance penalty
-
-config BCACHEFS_TESTS
- bool "bcachefs unit and performance tests"
- depends on BCACHEFS_FS
- help
- Include some unit and performance tests for the core btree code
-
-config BCACHEFS_LOCK_TIME_STATS
- bool "bcachefs lock time statistics"
- depends on BCACHEFS_FS
- help
- Expose statistics for how long we held a lock in debugfs
-
-config BCACHEFS_NO_LATENCY_ACCT
- bool "disable latency accounting and time stats"
- depends on BCACHEFS_FS
- help
- This disables device latency tracking and time stats, only for performance testing
-
-config BCACHEFS_SIX_OPTIMISTIC_SPIN
- bool "Optimistic spinning for six locks"
- depends on BCACHEFS_FS
- depends on SMP
- default y
- help
- Instead of immediately sleeping when attempting to take a six lock that
- is held by another thread, spin for a short while, as long as the
- thread owning the lock is running.
-
-config BCACHEFS_PATH_TRACEPOINTS
- bool "Extra btree_path tracepoints"
- depends on BCACHEFS_FS && TRACING
- help
- Enable extra tracepoints for debugging btree_path operations; we don't
- normally want these enabled because they happen at very high rates.
-
-config BCACHEFS_TRANS_KMALLOC_TRACE
- bool "Trace bch2_trans_kmalloc() calls"
- depends on BCACHEFS_FS
-
-config BCACHEFS_ASYNC_OBJECT_LISTS
- bool "Keep async objects on fast_lists for debugfs visibility"
- depends on BCACHEFS_FS && DEBUG_FS
-
-config MEAN_AND_VARIANCE_UNIT_TEST
- tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
- depends on KUNIT
- depends on BCACHEFS_FS
- default KUNIT_ALL_TESTS
- help
- This option enables the kunit tests for mean_and_variance module.
- If unsure, say N.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
deleted file mode 100644
index 93c8ee5425c8..000000000000
--- a/fs/bcachefs/Makefile
+++ /dev/null
@@ -1,107 +0,0 @@
-
-obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
-
-bcachefs-y := \
- acl.o \
- alloc_background.o \
- alloc_foreground.o \
- backpointers.o \
- bkey.o \
- bkey_methods.o \
- bkey_sort.o \
- bset.o \
- btree_cache.o \
- btree_gc.o \
- btree_io.o \
- btree_iter.o \
- btree_journal_iter.o \
- btree_key_cache.o \
- btree_locking.o \
- btree_node_scan.o \
- btree_trans_commit.o \
- btree_update.o \
- btree_update_interior.o \
- btree_write_buffer.o \
- buckets.o \
- buckets_waiting_for_journal.o \
- chardev.o \
- checksum.o \
- clock.o \
- compress.o \
- darray.o \
- data_update.o \
- debug.o \
- dirent.o \
- disk_accounting.o \
- disk_groups.o \
- ec.o \
- enumerated_ref.o \
- errcode.o \
- error.o \
- extents.o \
- extent_update.o \
- eytzinger.o \
- fast_list.o \
- fs.o \
- fs-ioctl.o \
- fs-io.o \
- fs-io-buffered.o \
- fs-io-direct.o \
- fs-io-pagecache.o \
- fsck.o \
- inode.o \
- io_read.o \
- io_misc.o \
- io_write.o \
- journal.o \
- journal_io.o \
- journal_reclaim.o \
- journal_sb.o \
- journal_seq_blacklist.o \
- keylist.o \
- logged_ops.o \
- lru.o \
- mean_and_variance.o \
- migrate.o \
- move.o \
- movinggc.o \
- namei.o \
- nocow_locking.o \
- opts.o \
- printbuf.o \
- progress.o \
- quota.o \
- rebalance.o \
- rcu_pending.o \
- recovery.o \
- recovery_passes.o \
- reflink.o \
- replicas.o \
- sb-clean.o \
- sb-counters.o \
- sb-downgrade.o \
- sb-errors.o \
- sb-members.o \
- siphash.o \
- six.o \
- snapshot.o \
- str_hash.o \
- subvolume.o \
- super.o \
- super-io.o \
- sysfs.o \
- tests.o \
- time_stats.o \
- thread_with_file.o \
- trace.o \
- two_state_shared_lock.o \
- util.o \
- varint.o \
- xattr.o
-
-bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o
-
-obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
-
-# Silence "note: xyz changed in GCC X.X" messages
-subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
deleted file mode 100644
index d03adc36100e..000000000000
--- a/fs/bcachefs/acl.c
+++ /dev/null
@@ -1,445 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-
-#include "acl.h"
-#include "xattr.h"
-
-#include <linux/posix_acl.h>
-
-static const char * const acl_types[] = {
- [ACL_USER_OBJ] = "user_obj",
- [ACL_USER] = "user",
- [ACL_GROUP_OBJ] = "group_obj",
- [ACL_GROUP] = "group",
- [ACL_MASK] = "mask",
- [ACL_OTHER] = "other",
- NULL,
-};
-
-void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
-{
- const void *p, *end = value + size;
-
- if (!value ||
- size < sizeof(bch_acl_header) ||
- ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
- return;
-
- p = value + sizeof(bch_acl_header);
- while (p < end) {
- const bch_acl_entry *in = p;
- unsigned tag = le16_to_cpu(in->e_tag);
-
- prt_str(out, acl_types[tag]);
-
- switch (tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- p += sizeof(bch_acl_entry_short);
- break;
- case ACL_USER:
- prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- case ACL_GROUP:
- prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- }
-
- prt_printf(out, " %o", le16_to_cpu(in->e_perm));
-
- if (p != end)
- prt_char(out, ' ');
- }
-}
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
-#include "fs.h"
-
-#include <linux/fs.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
-{
- return sizeof(bch_acl_header) +
- sizeof(bch_acl_entry_short) * nr_short +
- sizeof(bch_acl_entry) * nr_long;
-}
-
-static inline int acl_to_xattr_type(int type)
-{
- switch (type) {
- case ACL_TYPE_ACCESS:
- return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
- case ACL_TYPE_DEFAULT:
- return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
- default:
- BUG();
- }
-}
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
- const void *value, size_t size)
-{
- const void *p, *end = value + size;
- struct posix_acl *acl;
- struct posix_acl_entry *out;
- unsigned count = 0;
- int ret;
-
- if (!value)
- return NULL;
- if (size < sizeof(bch_acl_header))
- goto invalid;
- if (((bch_acl_header *)value)->a_version !=
- cpu_to_le32(BCH_ACL_VERSION))
- goto invalid;
-
- p = value + sizeof(bch_acl_header);
- while (p < end) {
- const bch_acl_entry *entry = p;
-
- if (p + sizeof(bch_acl_entry_short) > end)
- goto invalid;
-
- switch (le16_to_cpu(entry->e_tag)) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- p += sizeof(bch_acl_entry_short);
- break;
- case ACL_USER:
- case ACL_GROUP:
- p += sizeof(bch_acl_entry);
- break;
- default:
- goto invalid;
- }
-
- count++;
- }
-
- if (p > end)
- goto invalid;
-
- if (!count)
- return NULL;
-
- acl = allocate_dropping_locks(trans, ret,
- posix_acl_alloc(count, _gfp));
- if (!acl)
- return ERR_PTR(-ENOMEM);
- if (ret) {
- kfree(acl);
- return ERR_PTR(ret);
- }
-
- out = acl->a_entries;
-
- p = value + sizeof(bch_acl_header);
- while (p < end) {
- const bch_acl_entry *in = p;
-
- out->e_tag = le16_to_cpu(in->e_tag);
- out->e_perm = le16_to_cpu(in->e_perm);
-
- switch (out->e_tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- p += sizeof(bch_acl_entry_short);
- break;
- case ACL_USER:
- out->e_uid = make_kuid(&init_user_ns,
- le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- case ACL_GROUP:
- out->e_gid = make_kgid(&init_user_ns,
- le32_to_cpu(in->e_id));
- p += sizeof(bch_acl_entry);
- break;
- }
-
- out++;
- }
-
- BUG_ON(out != acl->a_entries + acl->a_count);
-
- return acl;
-invalid:
- pr_err("invalid acl entry");
- return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static struct bkey_i_xattr *
-bch2_acl_to_xattr(struct btree_trans *trans,
- const struct posix_acl *acl,
- int type)
-{
- struct bkey_i_xattr *xattr;
- bch_acl_header *acl_header;
- const struct posix_acl_entry *acl_e, *pe;
- void *outptr;
- unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
-
- FOREACH_ACL_ENTRY(acl_e, acl, pe) {
- switch (acl_e->e_tag) {
- case ACL_USER:
- case ACL_GROUP:
- nr_long++;
- break;
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- nr_short++;
- break;
- default:
- return ERR_PTR(-EINVAL);
- }
- }
-
- acl_len = bch2_acl_size(nr_short, nr_long);
- u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
-
- if (u64s > U8_MAX)
- return ERR_PTR(-E2BIG);
-
- xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
- if (IS_ERR(xattr))
- return xattr;
-
- bkey_xattr_init(&xattr->k_i);
- xattr->k.u64s = u64s;
- xattr->v.x_type = acl_to_xattr_type(type);
- xattr->v.x_name_len = 0;
- xattr->v.x_val_len = cpu_to_le16(acl_len);
-
- acl_header = xattr_val(&xattr->v);
- acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
-
- outptr = (void *) acl_header + sizeof(*acl_header);
-
- FOREACH_ACL_ENTRY(acl_e, acl, pe) {
- bch_acl_entry *entry = outptr;
-
- entry->e_tag = cpu_to_le16(acl_e->e_tag);
- entry->e_perm = cpu_to_le16(acl_e->e_perm);
- switch (acl_e->e_tag) {
- case ACL_USER:
- entry->e_id = cpu_to_le32(
- from_kuid(&init_user_ns, acl_e->e_uid));
- outptr += sizeof(bch_acl_entry);
- break;
- case ACL_GROUP:
- entry->e_id = cpu_to_le32(
- from_kgid(&init_user_ns, acl_e->e_gid));
- outptr += sizeof(bch_acl_entry);
- break;
-
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- outptr += sizeof(bch_acl_entry_short);
- break;
- }
- }
-
- BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
-
- return xattr;
-}
-
-struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
- struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
- struct btree_iter iter = {};
- struct posix_acl *acl = NULL;
-
- if (rcu)
- return ERR_PTR(-ECHILD);
-
- struct btree_trans *trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash, inode_inum(inode), &search, 0);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
- ret = PTR_ERR_OR_ZERO(acl);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- if (ret)
- acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
-
- if (!IS_ERR_OR_NULL(acl))
- set_cached_acl(&inode->v, type, acl);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return acl;
-}
-
-int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode_u,
- struct posix_acl *acl, int type)
-{
- struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
- int ret;
-
- if (type == ACL_TYPE_DEFAULT &&
- !S_ISDIR(inode_u->bi_mode))
- return acl ? -EACCES : 0;
-
- if (acl) {
- struct bkey_i_xattr *xattr =
- bch2_acl_to_xattr(trans, acl, type);
- if (IS_ERR(xattr))
- return PTR_ERR(xattr);
-
- ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
- inum, &xattr->k_i, 0);
- } else {
- struct xattr_search_key search =
- X_SEARCH(acl_to_xattr_type(type), "", 0);
-
- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
- inum, &search);
- }
-
- return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-}
-
-int bch2_set_acl(struct mnt_idmap *idmap,
- struct dentry *dentry,
- struct posix_acl *_acl, int type)
-{
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_iter inode_iter = {};
- struct bch_inode_unpacked inode_u;
- struct posix_acl *acl;
- umode_t mode;
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
- struct btree_trans *trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
- acl = _acl;
-
- ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
- bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_intent);
- if (ret)
- goto btree_err;
-
- mode = inode_u.bi_mode;
-
- if (type == ACL_TYPE_ACCESS) {
- ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
- if (ret)
- goto btree_err;
- }
-
- ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
- if (ret)
- goto btree_err;
-
- inode_u.bi_ctime = bch2_current_time(c);
- inode_u.bi_mode = mode;
-
- ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-btree_err:
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- if (unlikely(ret))
- goto err;
-
- bch2_inode_update_after_write(trans, inode, &inode_u,
- ATTR_CTIME|ATTR_MODE);
-
- set_cached_acl(&inode->v, type, acl);
-err:
- bch2_trans_put(trans);
- mutex_unlock(&inode->ei_update_lock);
-
- return ret;
-}
-
-int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode,
- umode_t mode,
- struct posix_acl **new_acl)
-{
- struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
- struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
- struct btree_iter iter;
- struct posix_acl *acl = NULL;
-
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
- &hash_info, inum, &search, BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-
- acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
- ret = PTR_ERR_OR_ZERO(acl);
- if (ret)
- goto err;
-
- ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode));
- if (ret)
- goto err;
-
- struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- new->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
- *new_acl = acl;
- acl = NULL;
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (!IS_ERR_OR_NULL(acl))
- kfree(acl);
- return ret;
-}
-
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
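For reference, the deleted bch2_acl_to_xattr() above sizes the on-disk xattr value from the counts of short and long ACL entries via bch2_acl_size(). Below is a minimal, standalone sketch of that same size calculation, assuming plain fixed-width integers in place of the kernel's __le16/__le32 types; the struct names and example entry counts here are illustrative only, not part of the source.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Mirrors the on-disk layouts declared in fs/bcachefs/acl.h (endianness wrappers omitted). */
typedef struct { uint16_t e_tag, e_perm; uint32_t e_id; } acl_entry;       /* 8 bytes: ACL_USER / ACL_GROUP */
typedef struct { uint16_t e_tag, e_perm; } acl_entry_short;                /* 4 bytes: obj / mask / other */
typedef struct { uint32_t a_version; } acl_header;                         /* 4 bytes */

/* Same computation as bch2_acl_size(): header plus all short and long entries. */
static size_t acl_size(unsigned nr_short, unsigned nr_long)
{
	return sizeof(acl_header) +
	       sizeof(acl_entry_short) * nr_short +
	       sizeof(acl_entry) * nr_long;
}

int main(void)
{
	/* Hypothetical ACL: user_obj, group_obj, mask, other, plus one named user and one named group. */
	printf("acl value size: %zu bytes\n", acl_size(4, 2)); /* 4 + 4*4 + 2*8 = 36 */
	return 0;
}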
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
deleted file mode 100644
index fe730a6bf0c1..000000000000
--- a/fs/bcachefs/acl.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ACL_H
-#define _BCACHEFS_ACL_H
-
-struct bch_inode_unpacked;
-struct bch_hash_info;
-struct bch_inode_info;
-struct posix_acl;
-
-#define BCH_ACL_VERSION 0x0001
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
- __le32 e_id;
-} bch_acl_entry;
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
-} bch_acl_entry_short;
-
-typedef struct {
- __le32 a_version;
-} bch_acl_header;
-
-void bch2_acl_to_text(struct printbuf *, const void *, size_t);
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
-struct posix_acl *bch2_get_acl(struct inode *, int, bool);
-
-int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- struct posix_acl *, int);
-int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- umode_t, struct posix_acl **);
-
-#else
-
-static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode_u,
- struct posix_acl *acl, int type)
-{
- return 0;
-}
-
-static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode,
- umode_t mode,
- struct posix_acl **new_acl)
-{
- return 0;
-}
-
-#endif /* CONFIG_BCACHEFS_POSIX_ACL */
-
-#endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
deleted file mode 100644
index 66de46318620..000000000000
--- a/fs/bcachefs/alloc_background.c
+++ /dev/null
@@ -1,2680 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "lru.h"
-#include "recovery.h"
-#include "varint.h"
-
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/random.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-#include <linux/sort.h>
-#include <linux/jiffies.h>
-
-static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
-
-/* Persistent alloc info: */
-
-static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bkey_alloc_unpacked {
- u64 journal_seq;
- u8 gen;
- u8 oldest_gen;
- u8 data_type;
- bool need_discard:1;
- bool need_inc_gen:1;
-#define x(_name, _bits) u##_bits _name;
- BCH_ALLOC_FIELDS_V2()
-#undef x
-};
-
-static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
- const void **p, unsigned field)
-{
- unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
- u64 v;
-
- if (!(a->fields & (1 << field)))
- return 0;
-
- switch (bytes) {
- case 1:
- v = *((const u8 *) *p);
- break;
- case 2:
- v = le16_to_cpup(*p);
- break;
- case 4:
- v = le32_to_cpup(*p);
- break;
- case 8:
- v = le64_to_cpup(*p);
- break;
- default:
- BUG();
- }
-
- *p += bytes;
- return v;
-}
-
-static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
- struct bkey_s_c k)
-{
- const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
- const void *d = in->data;
- unsigned idx = 0;
-
- out->gen = in->gen;
-
-#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
- BCH_ALLOC_FIELDS_V1()
-#undef x
-}
-
-static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
- struct bkey_s_c k)
-{
- struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
- const u8 *in = a.v->data;
- const u8 *end = bkey_val_end(a);
- unsigned fieldnr = 0;
- int ret;
- u64 v;
-
- out->gen = a.v->gen;
- out->oldest_gen = a.v->oldest_gen;
- out->data_type = a.v->data_type;
-
-#define x(_name, _bits) \
- if (fieldnr < a.v->nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v = 0; \
- } \
- out->_name = v; \
- if (v != out->_name) \
- return -1; \
- fieldnr++;
-
- BCH_ALLOC_FIELDS_V2()
-#undef x
- return 0;
-}
-
-static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
- struct bkey_s_c k)
-{
- struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
- const u8 *in = a.v->data;
- const u8 *end = bkey_val_end(a);
- unsigned fieldnr = 0;
- int ret;
- u64 v;
-
- out->gen = a.v->gen;
- out->oldest_gen = a.v->oldest_gen;
- out->data_type = a.v->data_type;
- out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
- out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
- out->journal_seq = le64_to_cpu(a.v->journal_seq);
-
-#define x(_name, _bits) \
- if (fieldnr < a.v->nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v = 0; \
- } \
- out->_name = v; \
- if (v != out->_name) \
- return -1; \
- fieldnr++;
-
- BCH_ALLOC_FIELDS_V2()
-#undef x
- return 0;
-}
-
-static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
-{
- struct bkey_alloc_unpacked ret = { .gen = 0 };
-
- switch (k.k->type) {
- case KEY_TYPE_alloc:
- bch2_alloc_unpack_v1(&ret, k);
- break;
- case KEY_TYPE_alloc_v2:
- bch2_alloc_unpack_v2(&ret, k);
- break;
- case KEY_TYPE_alloc_v3:
- bch2_alloc_unpack_v3(&ret, k);
- break;
- }
-
- return ret;
-}
-
-static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
-{
- unsigned i, bytes = offsetof(struct bch_alloc, data);
-
- for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
- if (a->fields & (1 << i))
- bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
-
- return DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
- int ret = 0;
-
- /* allow for unknown fields */
- bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
- c, alloc_v1_val_size_bad,
- "incorrect value size (%zu < %u)",
- bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
-fsck_err:
- return ret;
-}
-
-int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_alloc_unpacked u;
- int ret = 0;
-
- bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
- c, alloc_v2_unpack_error,
- "unpack error");
-fsck_err:
- return ret;
-}
-
-int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_alloc_unpacked u;
- int ret = 0;
-
- bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
- c, alloc_v3_unpack_error,
- "unpack error");
-fsck_err:
- return ret;
-}
-
-int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bch_alloc_v4 a;
- int ret = 0;
-
- bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
-
- bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
- c, alloc_v4_val_size_bad,
- "bad val size (%u > %zu)",
- alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
-
- bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
- c, alloc_v4_backpointers_start_bad,
- "invalid backpointers_start");
-
- bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
- c, alloc_key_data_type_bad,
- "invalid data type (got %u should be %u)",
- a.data_type, alloc_data_type(a, a.data_type));
-
- for (unsigned i = 0; i < 2; i++)
- bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
- c, alloc_key_io_time_bad,
- "invalid io_time[%s]: %llu, max %llu",
- i == READ ? "read" : "write",
- a.io_time[i], LRU_TIME_MAX);
-
- unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
- offsetof(struct bch_alloc_v4, stripe_sectors)
- ? a.stripe_sectors
- : 0;
-
- switch (a.data_type) {
- case BCH_DATA_free:
- case BCH_DATA_need_gc_gens:
- case BCH_DATA_need_discard:
- bkey_fsck_err_on(stripe_sectors ||
- a.dirty_sectors ||
- a.cached_sectors ||
- a.stripe,
- c, alloc_key_empty_but_have_data,
- "empty data type free but have data %u.%u.%u %u",
- stripe_sectors,
- a.dirty_sectors,
- a.cached_sectors,
- a.stripe);
- break;
- case BCH_DATA_sb:
- case BCH_DATA_journal:
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- bkey_fsck_err_on(!a.dirty_sectors &&
- !stripe_sectors,
- c, alloc_key_dirty_sectors_0,
- "data_type %s but dirty_sectors==0",
- bch2_data_type_str(a.data_type));
- break;
- case BCH_DATA_cached:
- bkey_fsck_err_on(!a.cached_sectors ||
- a.dirty_sectors ||
- stripe_sectors ||
- a.stripe,
- c, alloc_key_cached_inconsistency,
- "data type inconsistency");
-
- bkey_fsck_err_on(!a.io_time[READ] &&
- !(c->recovery.passes_to_run &
- BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)),
- c, alloc_key_cached_but_read_time_zero,
- "cached bucket with read_time == 0");
- break;
- case BCH_DATA_stripe:
- break;
- }
-fsck_err:
- return ret;
-}
-
-void bch2_alloc_v4_swab(struct bkey_s k)
-{
- struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
-
- a->journal_seq_nonempty = swab64(a->journal_seq_nonempty);
- a->journal_seq_empty = swab64(a->journal_seq_empty);
- a->flags = swab32(a->flags);
- a->dirty_sectors = swab32(a->dirty_sectors);
- a->cached_sectors = swab32(a->cached_sectors);
- a->io_time[0] = swab64(a->io_time[0]);
- a->io_time[1] = swab64(a->io_time[1]);
- a->stripe = swab32(a->stripe);
- a->nr_external_backpointers = swab32(a->nr_external_backpointers);
- a->stripe_sectors = swab32(a->stripe_sectors);
-}
-
-static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c,
- unsigned dev, const struct bch_alloc_v4 *a)
-{
- struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL;
-
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
- bch2_prt_data_type(out, a->data_type);
- prt_newline(out);
- prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty);
- prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty);
- prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
- prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
- prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
- prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
- prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
- prt_printf(out, "stripe %u\n", a->stripe);
- prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
- prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
- prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
-
- if (ca)
- prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca));
- prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
- printbuf_indent_sub(out, 2);
-
- bch2_dev_put(ca);
-}
-
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
-
- __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a);
-}
-
-void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v);
-}
-
-void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
-{
- if (k.k->type == KEY_TYPE_alloc_v4) {
- void *src, *dst;
-
- *out = *bkey_s_c_to_alloc_v4(k).v;
-
- src = alloc_v4_backpointers(out);
- SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
- dst = alloc_v4_backpointers(out);
-
- if (src < dst)
- memset(src, 0, dst - src);
-
- SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
- } else {
- struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
-
- *out = (struct bch_alloc_v4) {
- .journal_seq_nonempty = u.journal_seq,
- .flags = u.need_discard,
- .gen = u.gen,
- .oldest_gen = u.oldest_gen,
- .data_type = u.data_type,
- .stripe_redundancy = u.stripe_redundancy,
- .dirty_sectors = u.dirty_sectors,
- .cached_sectors = u.cached_sectors,
- .io_time[READ] = u.read_time,
- .io_time[WRITE] = u.write_time,
- .stripe = u.stripe,
- };
-
- SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
- }
-}
-
-static noinline struct bkey_i_alloc_v4 *
-__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bkey_i_alloc_v4 *ret;
-
- ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
- if (IS_ERR(ret))
- return ret;
-
- if (k.k->type == KEY_TYPE_alloc_v4) {
- void *src, *dst;
-
- bkey_reassemble(&ret->k_i, k);
-
- src = alloc_v4_backpointers(&ret->v);
- SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
- dst = alloc_v4_backpointers(&ret->v);
-
- if (src < dst)
- memset(src, 0, dst - src);
-
- SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
- set_alloc_v4_u64s(ret);
- } else {
- bkey_alloc_v4_init(&ret->k_i);
- ret->k.p = k.k->p;
- bch2_alloc_to_v4(k, &ret->v);
- }
- return ret;
-}
-
-static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bkey_s_c_alloc_v4 a;
-
- if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
- ((a = bkey_s_c_to_alloc_v4(k), true) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
- return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
-
- return __bch2_alloc_to_v4_mut(trans, k);
-}
-
-struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
-{
- return bch2_alloc_to_v4_mut_inlined(trans, k);
-}
-
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos)
-{
- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
- BTREE_ITER_with_updates|
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (unlikely(ret))
- return ERR_PTR(ret);
-
- struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
- ret = PTR_ERR_OR_ZERO(a);
- if (unlikely(ret))
- goto err;
- return a;
-err:
- bch2_trans_iter_exit(trans, iter);
- return ERR_PTR(ret);
-}
-
-__flatten
-struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos,
- BTREE_ITER_with_updates|
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (unlikely(ret))
- return ERR_PTR(ret);
-
- if ((void *) k.v >= trans->mem &&
- (void *) k.v < trans->mem + trans->mem_top) {
- bch2_trans_iter_exit(trans, &iter);
- return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v);
- }
-
- struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
- if (IS_ERR(a)) {
- bch2_trans_iter_exit(trans, &iter);
- return a;
- }
-
- ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_);
- bch2_trans_iter_exit(trans, &iter);
- return unlikely(ret) ? ERR_PTR(ret) : a;
-}
-
-static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
-{
- *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
-
- pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
- return pos;
-}
-
-static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
-{
- pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
- pos.offset += offset;
- return pos;
-}
-
-static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
-{
- return k.k->type == KEY_TYPE_bucket_gens
- ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
- : 0;
-}
-
-int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
- c, bucket_gens_val_size_bad,
- "bad val size (%zu != %zu)",
- bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
-fsck_err:
- return ret;
-}
-
-void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
- if (i)
- prt_char(out, ' ');
- prt_printf(out, "%u", g.v->gens[i]);
- }
-}
-
-int bch2_bucket_gens_init(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bkey_i_bucket_gens g;
- bool have_bucket_gens_key = false;
- int ret;
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- /*
- * Not a fsck error because this is checked/repaired by
- * bch2_check_alloc_key() which runs later:
- */
- if (!bch2_dev_bucket_exists(c, k.k->p))
- continue;
-
- struct bch_alloc_v4 a;
- u8 gen = bch2_alloc_to_v4(k, &a)->gen;
- unsigned offset;
- struct bpos pos = alloc_gens_pos(iter.pos, &offset);
- int ret2 = 0;
-
- if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
- ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret2)
- goto iter_err;
- have_bucket_gens_key = false;
- }
-
- if (!have_bucket_gens_key) {
- bkey_bucket_gens_init(&g.k_i);
- g.k.p = pos;
- have_bucket_gens_key = true;
- }
-
- g.v.gens[offset] = gen;
-iter_err:
- ret2;
- }));
-
- if (have_bucket_gens_key && !ret)
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
-
- bch2_trans_put(trans);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_alloc_read(struct bch_fs *c)
-{
- down_read(&c->state_lock);
-
- struct btree_trans *trans = bch2_trans_get(c);
- struct bch_dev *ca = NULL;
- int ret;
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
- ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
- u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
-
- if (k.k->type != KEY_TYPE_bucket_gens)
- continue;
-
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- /*
- * Not a fsck error because this is checked/repaired by
- * bch2_check_alloc_key() which runs later:
- */
- if (!ca) {
- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
-
- for (u64 b = max_t(u64, ca->mi.first_bucket, start);
- b < min_t(u64, ca->mi.nbuckets, end);
- b++)
- *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
- 0;
- }));
- } else {
- ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- /*
- * Not a fsck error because this is checked/repaired by
- * bch2_check_alloc_key() which runs later:
- */
- if (!ca) {
- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- if (k.k->p.offset < ca->mi.first_bucket) {
- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket));
- continue;
- }
-
- if (k.k->p.offset >= ca->mi.nbuckets) {
- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- struct bch_alloc_v4 a;
- *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
- 0;
- }));
- }
-
- bch2_dev_put(ca);
- bch2_trans_put(trans);
-
- up_read(&c->state_lock);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* Free space/discard btree: */
-
-static int __need_discard_or_freespace_err(struct btree_trans *trans,
- struct bkey_s_c alloc_k,
- bool set, bool discard, bool repair)
-{
- struct bch_fs *c = trans->c;
- enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0);
- enum bch_sb_error_id err_id = discard
- ? BCH_FSCK_ERR_need_discard_key_wrong
- : BCH_FSCK_ERR_freespace_key_wrong;
- enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace;
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, alloc_k);
-
- int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
- "bucket incorrectly %sset in %s btree\n%s",
- set ? "" : "un",
- bch2_btree_id_str(btree),
- buf.buf);
- if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) ||
- bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed))
- ret = 0;
-
- printbuf_exit(&buf);
- return ret;
-}
-
-#define need_discard_or_freespace_err(...) \
- fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__))
-
-#define need_discard_or_freespace_err_on(cond, ...) \
- (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false)
-
-static int bch2_bucket_do_index(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c alloc_k,
- const struct bch_alloc_v4 *a,
- bool set)
-{
- enum btree_id btree;
- struct bpos pos;
-
- if (a->data_type != BCH_DATA_free &&
- a->data_type != BCH_DATA_need_discard)
- return 0;
-
- switch (a->data_type) {
- case BCH_DATA_free:
- btree = BTREE_ID_freespace;
- pos = alloc_freespace_pos(alloc_k.k->p, *a);
- break;
- case BCH_DATA_need_discard:
- btree = BTREE_ID_need_discard;
- pos = alloc_k.k->p;
- break;
- default:
- return 0;
- }
-
- struct btree_iter iter;
- struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent);
- int ret = bkey_err(old);
- if (ret)
- return ret;
-
- need_discard_or_freespace_err_on(ca->mi.freespace_initialized &&
- !old.k->type != set,
- trans, alloc_k, set,
- btree == BTREE_ID_need_discard, false);
-
- ret = bch2_btree_bit_mod_iter(trans, &iter, set);
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
- struct bpos bucket, u8 gen)
-{
- struct btree_iter iter;
- unsigned offset;
- struct bpos pos = alloc_gens_pos(bucket, &offset);
- struct bkey_i_bucket_gens *g;
- struct bkey_s_c k;
- int ret;
-
- g = bch2_trans_kmalloc(trans, sizeof(*g));
- ret = PTR_ERR_OR_ZERO(g);
- if (ret)
- return ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
- BTREE_ITER_intent|
- BTREE_ITER_with_updates);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (k.k->type != KEY_TYPE_bucket_gens) {
- bkey_bucket_gens_init(&g->k_i);
- g->k.p = iter.pos;
- } else {
- bkey_reassemble(&g->k_i, k);
- }
-
- g->v.gens[offset] = gen;
-
- ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
- enum bch_data_type data_type,
- s64 delta_buckets,
- s64 delta_sectors,
- s64 delta_fragmented, unsigned flags)
-{
- s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
-
- return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
- d, dev_data_type,
- .dev = ca->dev_idx,
- .data_type = data_type);
-}
-
-int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
- const struct bch_alloc_v4 *old,
- const struct bch_alloc_v4 *new,
- unsigned flags)
-{
- s64 old_sectors = bch2_bucket_sectors(*old);
- s64 new_sectors = bch2_bucket_sectors(*new);
- if (old->data_type != new->data_type) {
- int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
- 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
- bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
- -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
- if (ret)
- return ret;
- } else if (old_sectors != new_sectors) {
- int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
- 0,
- new_sectors - old_sectors,
- bch2_bucket_sectors_fragmented(ca, *new) -
- bch2_bucket_sectors_fragmented(ca, *old), flags);
- if (ret)
- return ret;
- }
-
- s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
- s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
- if (old_unstriped != new_unstriped) {
- int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
- !!new_unstriped - !!old_unstriped,
- new_unstriped - old_unstriped,
- 0,
- flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_trigger_alloc(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
- if (!ca)
- return bch_err_throw(c, trigger_alloc);
-
- struct bch_alloc_v4 old_a_convert;
- const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
-
- struct bch_alloc_v4 *new_a;
- if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
- new_a = bkey_s_to_alloc_v4(new).v;
- } else {
- BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
-
- struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
- ret = PTR_ERR_OR_ZERO(new_ka);
- if (unlikely(ret))
- goto err;
- new_a = &new_ka->v;
- }
-
- if (flags & BTREE_TRIGGER_transactional) {
- alloc_data_type_set(new_a, new_a->data_type);
-
- int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
- (int) data_type_is_empty(old_a->data_type);
-
- if (is_empty_delta < 0) {
- new_a->io_time[READ] = bch2_current_io_time(c, READ);
- new_a->io_time[WRITE] = bch2_current_io_time(c, WRITE);
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
- SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
- }
-
- if (data_type_is_empty(new_a->data_type) &&
- BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
- !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
- if (new_a->oldest_gen == new_a->gen &&
- !bch2_bucket_sectors_total(*new_a))
- new_a->oldest_gen++;
- new_a->gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
- alloc_data_type_set(new_a, new_a->data_type);
- }
-
- if (old_a->data_type != new_a->data_type ||
- (new_a->data_type == BCH_DATA_free &&
- alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
- ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
- bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
- if (ret)
- goto err;
- }
-
- if (new_a->data_type == BCH_DATA_cached &&
- !new_a->io_time[READ])
- new_a->io_time[READ] = bch2_current_io_time(c, READ);
-
- ret = bch2_lru_change(trans, new.k->p.inode,
- bucket_to_u64(new.k->p),
- alloc_lru_idx_read(*old_a),
- alloc_lru_idx_read(*new_a));
- if (ret)
- goto err;
-
- ret = bch2_lru_change(trans,
- BCH_LRU_BUCKET_FRAGMENTATION,
- bucket_to_u64(new.k->p),
- alloc_lru_idx_fragmentation(*old_a, ca),
- alloc_lru_idx_fragmentation(*new_a, ca));
- if (ret)
- goto err;
-
- if (old_a->gen != new_a->gen) {
- ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
- if (ret)
- goto err;
- }
-
- ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
- if (ret)
- goto err;
- }
-
- if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
- u64 transaction_seq = trans->journal_res.seq;
- BUG_ON(!transaction_seq);
-
- if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
- trans, alloc_key_journal_seq_in_future,
- "bucket journal seq in future (currently at %llu)\n%s",
- journal_cur_seq(&c->journal),
- (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
- new_a->journal_seq_nonempty = transaction_seq;
-
- int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
- (int) data_type_is_empty(old_a->data_type);
-
- /*
- * Record journal sequence number of empty -> nonempty transition:
- * Note that there may be multiple empty -> nonempty transitions;
- * data in a bucket may be overwritten while we're still writing to
- * it - so be careful to only record the first:
- */
- if (is_empty_delta < 0 &&
- new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
- new_a->journal_seq_nonempty = transaction_seq;
- new_a->journal_seq_empty = 0;
- }
-
- /*
- * Bucket becomes empty: mark it as waiting for a journal flush,
- * unless updates since empty -> nonempty transition were never
- * flushed - we may need to ask the journal not to flush
- * intermediate sequence numbers:
- */
- if (is_empty_delta > 0) {
- if (new_a->journal_seq_nonempty == transaction_seq ||
- bch2_journal_noflush_seq(&c->journal,
- new_a->journal_seq_nonempty,
- transaction_seq)) {
- new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
- } else {
- new_a->journal_seq_empty = transaction_seq;
-
- ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- new.k->p.inode, new.k->p.offset,
- transaction_seq);
- if (bch2_fs_fatal_err_on(ret, c,
- "setting bucket_needs_journal_commit: %s",
- bch2_err_str(ret)))
- goto err;
- }
- }
-
- if (new_a->gen != old_a->gen) {
- guard(rcu)();
- u8 *gen = bucket_gen(ca, new.k->p.offset);
- if (unlikely(!gen))
- goto invalid_bucket;
- *gen = new_a->gen;
- }
-
-#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
-#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
-#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk)
-
- if (statechange(a->data_type == BCH_DATA_free) &&
- bucket_flushed(new_a))
- closure_wake_up(&c->freelist_wait);
-
- if (statechange(a->data_type == BCH_DATA_need_discard) &&
- !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
- bucket_flushed(new_a))
- bch2_discard_one_bucket_fast(ca, new.k->p.offset);
-
- if (statechange(a->data_type == BCH_DATA_cached) &&
- !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_dev_do_invalidates(ca);
-
- if (statechange(a->data_type == BCH_DATA_need_gc_gens))
- bch2_gc_gens_async(c);
- }
-
- if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
- guard(rcu)();
- struct bucket *g = gc_bucket(ca, new.k->p.offset);
- if (unlikely(!g))
- goto invalid_bucket;
- g->gen_valid = 1;
- g->gen = new_a->gen;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch2_dev_put(ca);
- return ret;
-invalid_bucket:
- bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
- (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
- ret = bch_err_throw(c, trigger_alloc);
- goto err;
-}
-
-/*
- * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
- * extents style btrees, but works on non-extents btrees:
- */
-static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end, struct bkey *hole)
-{
- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
-
- if (bkey_err(k))
- return k;
-
- if (k.k->type) {
- return k;
- } else {
- struct btree_iter iter2;
- struct bpos next;
-
- bch2_trans_copy_iter(trans, &iter2, iter);
-
- struct btree_path *path = btree_iter_path(trans, iter);
- if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
- end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
-
- end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
-
- /*
- * btree node min/max is a closed interval, while peek_max takes a
- * half-open interval:
- */
- k = bch2_btree_iter_peek_max(trans, &iter2, end);
- next = iter2.pos;
- bch2_trans_iter_exit(trans, &iter2);
-
- BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
-
- if (bkey_err(k))
- return k;
-
- bkey_init(hole);
- hole->p = iter->pos;
-
- bch2_key_resize(hole, next.offset - iter->pos.offset);
- return (struct bkey_s_c) { hole, NULL };
- }
-}
-
-static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
-{
- if (*ca) {
- if (bucket->offset < (*ca)->mi.first_bucket)
- bucket->offset = (*ca)->mi.first_bucket;
-
- if (bucket->offset < (*ca)->mi.nbuckets)
- return true;
-
- bch2_dev_put(*ca);
- *ca = NULL;
- bucket->inode++;
- bucket->offset = 0;
- }
-
- guard(rcu)();
- *ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
- if (*ca) {
- *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
- bch2_dev_get(*ca);
- }
-
- return *ca != NULL;
-}
-
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_dev **ca, struct bkey *hole)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k;
-again:
- k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole);
- if (bkey_err(k))
- return k;
-
- *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
-
- if (!k.k->type) {
- struct bpos hole_start = bkey_start_pos(k.k);
-
- if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
- if (!next_bucket(c, ca, &hole_start))
- return bkey_s_c_null;
-
- bch2_btree_iter_set_pos(trans, iter, hole_start);
- goto again;
- }
-
- if (k.k->p.offset > (*ca)->mi.nbuckets)
- bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
- }
-
- return k;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_key(struct btree_trans *trans,
- struct bkey_s_c alloc_k,
- struct btree_iter *alloc_iter,
- struct btree_iter *discard_iter,
- struct btree_iter *freespace_iter,
- struct btree_iter *bucket_gens_iter)
-{
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- unsigned gens_offset;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
- if (fsck_err_on(!ca,
- trans, alloc_key_to_missing_dev_bucket,
- "alloc key for invalid device:bucket %llu:%llu",
- alloc_k.k->p.inode, alloc_k.k->p.offset))
- ret = bch2_btree_delete_at(trans, alloc_iter, 0);
- if (!ca)
- return ret;
-
- if (!ca->mi.freespace_initialized)
- goto out;
-
- a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p);
- k = bch2_btree_iter_peek_slot(trans, discard_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bool is_discarded = a->data_type == BCH_DATA_need_discard;
- if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded,
- trans, alloc_k, !is_discarded, true, true)) {
- ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded);
- if (ret)
- goto err;
- }
-
- bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
- k = bch2_btree_iter_peek_slot(trans, freespace_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bool is_free = a->data_type == BCH_DATA_free;
- if (need_discard_or_freespace_err_on(!!k.k->type != is_free,
- trans, alloc_k, !is_free, false, true)) {
- ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free);
- if (ret)
- goto err;
- }
-
- bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
- k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
- trans, bucket_gens_key_wrong,
- "incorrect gen in bucket_gens btree (got %u should be %u)\n%s",
- alloc_gen(k, gens_offset), a->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- struct bkey_i_bucket_gens *g =
- bch2_trans_kmalloc(trans, sizeof(*g));
-
- ret = PTR_ERR_OR_ZERO(g);
- if (ret)
- goto err;
-
- if (k.k->type == KEY_TYPE_bucket_gens) {
- bkey_reassemble(&g->k_i, k);
- } else {
- bkey_bucket_gens_init(&g->k_i);
- g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
- }
-
- g->v.gens[gens_offset] = a->gen;
-
- ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
- if (ret)
- goto err;
- }
-out:
-err:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos start,
- struct bpos *end,
- struct btree_iter *freespace_iter)
-{
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- if (!ca->mi.freespace_initialized)
- return 0;
-
- bch2_btree_iter_set_pos(trans, freespace_iter, start);
-
- k = bch2_btree_iter_peek_slot(trans, freespace_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- *end = bkey_min(k.k->p, *end);
-
- if (fsck_err_on(k.k->type != KEY_TYPE_set,
- trans, freespace_hole_missing,
- "hole in alloc btree missing in freespace btree\n"
- "device %llu buckets %llu-%llu",
- freespace_iter->pos.inode,
- freespace_iter->pos.offset,
- end->offset)) {
- struct bkey_i *update =
- bch2_trans_kmalloc(trans, sizeof(*update));
-
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- goto err;
-
- bkey_init(&update->k);
- update->k.type = KEY_TYPE_set;
- update->k.p = freespace_iter->pos;
- bch2_key_resize(&update->k,
- min_t(u64, U32_MAX, end->offset -
- freespace_iter->pos.offset));
-
- ret = bch2_trans_update(trans, freespace_iter, update, 0);
- if (ret)
- goto err;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static noinline_for_stack
-int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
- struct bpos start,
- struct bpos *end,
- struct btree_iter *bucket_gens_iter)
-{
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- unsigned i, gens_offset, gens_end_offset;
- int ret;
-
- bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
-
- k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
- alloc_gens_pos(*end, &gens_end_offset)))
- gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
-
- if (k.k->type == KEY_TYPE_bucket_gens) {
- struct bkey_i_bucket_gens g;
- bool need_update = false;
-
- bkey_reassemble(&g.k_i, k);
-
- for (i = gens_offset; i < gens_end_offset; i++) {
- if (fsck_err_on(g.v.gens[i], trans,
- bucket_gens_hole_wrong,
- "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
- bucket_gens_pos_to_alloc(k.k->p, i).inode,
- bucket_gens_pos_to_alloc(k.k->p, i).offset,
- g.v.gens[i])) {
- g.v.gens[i] = 0;
- need_update = true;
- }
- }
-
- if (need_update) {
- struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
-
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- memcpy(u, &g, sizeof(g));
-
- ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
- if (ret)
- goto err;
- }
- }
-
- *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-struct check_discard_freespace_key_async {
- struct work_struct work;
- struct bch_fs *c;
- struct bbpos pos;
-};
-
-static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- u8 gen;
- ret = k.k->type != KEY_TYPE_set
- ? bch2_check_discard_freespace_key(trans, &iter, &gen, false)
- : 0;
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static void check_discard_freespace_key_work(struct work_struct *work)
-{
- struct check_discard_freespace_key_async *w =
- container_of(work, struct check_discard_freespace_key_async, work);
-
- bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
- enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key);
- kfree(w);
-}
-
-int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen,
- bool async_repair)
-{
- struct bch_fs *c = trans->c;
- enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
- ? BCH_DATA_need_discard
- : BCH_DATA_free;
- struct printbuf buf = PRINTBUF;
-
- unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)|
- FSCK_CAN_FIX|FSCK_CAN_IGNORE;
-
- struct bpos bucket = iter->pos;
- bucket.offset &= ~(~0ULL << 56);
- u64 genbits = iter->pos.offset & (~0ULL << 56);
-
- struct btree_iter alloc_iter;
- struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
- BTREE_ID_alloc, bucket,
- async_repair ? BTREE_ITER_cached : 0);
- int ret = bkey_err(alloc_k);
- if (ret)
- return ret;
-
- if (!bch2_dev_bucket_exists(c, bucket)) {
- if (__fsck_err(trans, fsck_flags,
- need_discard_freespace_key_to_invalid_dev_bucket,
- "entry in %s btree for nonexistant dev:bucket %llu:%llu",
- bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
- goto delete;
- ret = 1;
- goto out;
- }
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- if (a->data_type != state ||
- (state == BCH_DATA_free &&
- genbits != alloc_freespace_genbits(*a))) {
- if (__fsck_err(trans, fsck_flags,
- need_discard_freespace_key_bad,
- "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
- bch2_btree_id_str(iter->btree_id),
- iter->pos.inode,
- iter->pos.offset,
- a->data_type == state,
- genbits >> 56, alloc_freespace_genbits(*a) >> 56))
- goto delete;
- ret = 1;
- goto out;
- }
-
- *gen = a->gen;
-out:
-fsck_err:
- bch2_set_btree_iter_dontneed(trans, &alloc_iter);
- bch2_trans_iter_exit(trans, &alloc_iter);
- printbuf_exit(&buf);
- return ret;
-delete:
- if (!async_repair) {
- ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc) ?:
- bch_err_throw(c, transaction_restart_commit);
- goto out;
- } else {
- /*
- * We can't repair here when called from the allocator path: the
- * commit will recurse back into the allocator
- */
- struct check_discard_freespace_key_async *w =
- kzalloc(sizeof(*w), GFP_KERNEL);
- if (!w)
- goto out;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) {
- kfree(w);
- goto out;
- }
-
- INIT_WORK(&w->work, check_discard_freespace_key_work);
- w->c = c;
- w->pos = BBPOS(iter->btree_id, iter->pos);
- queue_work(c->write_ref_wq, &w->work);
-
- ret = 1; /* don't allocate from this bucket */
- goto out;
- }
-}
-
-static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
-{
- u8 gen;
- int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
- return ret < 0 ? ret : 0;
-}
-
-/*
- * We've already checked that generation numbers in the bucket_gens btree are
- * valid for buckets that exist; this just checks for keys for nonexistent
- * buckets.
- */
-static noinline_for_stack
-int bch2_check_bucket_gens_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_bucket_gens g;
- u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
- u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
- u64 b;
- bool need_update = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
- bkey_reassemble(&g.k_i, k);
-
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
- if (!ca) {
- if (fsck_err(trans, bucket_gens_to_invalid_dev,
- "bucket_gens key for invalid device:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto out;
- }
-
- if (fsck_err_on(end <= ca->mi.first_bucket ||
- start >= ca->mi.nbuckets,
- trans, bucket_gens_to_invalid_buckets,
- "bucket_gens key for invalid buckets:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto out;
- }
-
- for (b = start; b < ca->mi.first_bucket; b++)
- if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
- trans, bucket_gens_nonzero_for_invalid_buckets,
- "bucket_gens key has nonzero gen for invalid bucket")) {
- g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
- need_update = true;
- }
-
- for (b = ca->mi.nbuckets; b < end; b++)
- if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
- trans, bucket_gens_nonzero_for_invalid_buckets,
- "bucket_gens key has nonzero gen for invalid bucket")) {
- g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
- need_update = true;
- }
-
- if (need_update) {
- struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
-
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto out;
-
- memcpy(u, &g, sizeof(g));
- ret = bch2_trans_update(trans, iter, u, 0);
- }
-out:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_alloc_info(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
- struct bch_dev *ca = NULL;
- struct bkey hole;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_prefetch);
-
- while (1) {
- struct bpos next;
-
- bch2_trans_begin(trans);
-
- k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole);
- ret = bkey_err(k);
- if (ret)
- goto bkey_err;
-
- if (!k.k)
- break;
-
- if (k.k->type) {
- next = bpos_nosnap_successor(k.k->p);
-
- ret = bch2_check_alloc_key(trans,
- k, &iter,
- &discard_iter,
- &freespace_iter,
- &bucket_gens_iter);
- if (ret)
- goto bkey_err;
- } else {
- next = k.k->p;
-
- ret = bch2_check_alloc_hole_freespace(trans, ca,
- bkey_start_pos(k.k),
- &next,
- &freespace_iter) ?:
- bch2_check_alloc_hole_bucket_gens(trans,
- bkey_start_pos(k.k),
- &next,
- &bucket_gens_iter);
- if (ret)
- goto bkey_err;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_set_pos(trans, &iter, next);
-bkey_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &bucket_gens_iter);
- bch2_trans_iter_exit(trans, &freespace_iter);
- bch2_trans_iter_exit(trans, &discard_iter);
- bch2_trans_iter_exit(trans, &iter);
- bch2_dev_put(ca);
- ca = NULL;
-
- if (ret < 0)
- goto err;
-
- ret = for_each_btree_key(trans, iter,
- BTREE_ID_need_discard, POS_MIN,
- BTREE_ITER_prefetch, k,
- bch2_check_discard_freespace_key_fsck(trans, &iter));
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_prefetch);
- while (1) {
- bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(trans, &iter);
- if (!k.k)
- break;
-
- ret = bkey_err(k) ?:
- bch2_check_discard_freespace_key_fsck(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
- if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err(c, "while checking %s", buf.buf);
- printbuf_exit(&buf);
- break;
- }
-
- bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos));
- }
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
-
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_bucket_gens_key(trans, &iter, k));
-err:
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
- struct btree_iter *alloc_iter,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- struct bkey_s_c alloc_k;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- alloc_k = bch2_btree_iter_peek(trans, alloc_iter);
- if (!alloc_k.k)
- return 0;
-
- ret = bkey_err(alloc_k);
- if (ret)
- return ret;
-
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode);
- if (!ca)
- return 0;
-
- a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
- if (lru_idx) {
- ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
- bucket_to_u64(alloc_k.k->p),
- lru_idx, alloc_k, last_flushed);
- if (ret)
- goto err;
- }
-
- if (a->data_type != BCH_DATA_cached)
- goto err;
-
- if (fsck_err_on(!a->io_time[READ],
- trans, alloc_key_cached_but_read_time_zero,
- "cached bucket with read_time 0\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- struct bkey_i_alloc_v4 *a_mut =
- bch2_alloc_to_v4_mut(trans, alloc_k);
- ret = PTR_ERR_OR_ZERO(a_mut);
- if (ret)
- goto err;
-
- a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
- ret = bch2_trans_update(trans, alloc_iter,
- &a_mut->k_i, BTREE_TRIGGER_norun);
- if (ret)
- goto err;
-
- a = &a_mut->v;
- }
-
- ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
- bucket_to_u64(alloc_k.k->p),
- a->io_time[READ],
- alloc_k, last_flushed);
- if (ret)
- goto err;
-err:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
- bch2_check_stripe_to_lru_refs(c);
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
-{
- struct bch_fs *c = ca->fs;
- int ret;
-
- mutex_lock(&ca->discard_buckets_in_flight_lock);
- struct discard_in_flight *i =
- darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
- if (i) {
- ret = bch_err_throw(c, EEXIST_discard_in_flight_add);
- goto out;
- }
-
- ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
- .in_progress = in_progress,
- .bucket = bucket,
- }));
-out:
- mutex_unlock(&ca->discard_buckets_in_flight_lock);
- return ret;
-}
-
-static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
-{
- mutex_lock(&ca->discard_buckets_in_flight_lock);
- struct discard_in_flight *i =
- darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
- BUG_ON(!i || !i->in_progress);
-
- darray_remove_item(&ca->discard_buckets_in_flight, i);
- mutex_unlock(&ca->discard_buckets_in_flight_lock);
-}
-
-struct discard_buckets_state {
- u64 seen;
- u64 open;
- u64 need_journal_commit;
- u64 discarded;
-};
-
-static int bch2_discard_one_bucket(struct btree_trans *trans,
- struct bch_dev *ca,
- struct btree_iter *need_discard_iter,
- struct bpos *discard_pos_done,
- struct discard_buckets_state *s,
- bool fastpath)
-{
- struct bch_fs *c = trans->c;
- struct bpos pos = need_discard_iter->pos;
- struct btree_iter iter = {};
- struct bkey_s_c k;
- struct bkey_i_alloc_v4 *a;
- struct printbuf buf = PRINTBUF;
- bool discard_locked = false;
- int ret = 0;
-
- if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
- s->open++;
- goto out;
- }
-
- u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
- pos.inode, pos.offset);
- if (seq_ready > c->journal.flushed_seq_ondisk) {
- if (seq_ready > c->journal.flushing_seq)
- s->need_journal_commit++;
- goto out;
- }
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- need_discard_iter->pos,
- BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- goto out;
-
- a = bch2_alloc_to_v4_mut(trans, k);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto out;
-
- if (a->v.data_type != BCH_DATA_need_discard) {
- if (need_discard_or_freespace_err(trans, k, true, true, true)) {
- ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
- if (ret)
- goto out;
- goto commit;
- }
-
- goto out;
- }
-
- if (!fastpath) {
- if (discard_in_flight_add(ca, iter.pos.offset, true))
- goto out;
-
- discard_locked = true;
- }
-
- if (!bkey_eq(*discard_pos_done, iter.pos)) {
- s->discarded++;
- *discard_pos_done = iter.pos;
-
- if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) {
- /*
- * This works without any other locks because this is the only
- * thread that removes items from the need_discard tree
- */
- bch2_trans_unlock_long(trans);
- blkdev_issue_discard(ca->disk_sb.bdev,
- k.k->p.offset * ca->mi.bucket_size,
- ca->mi.bucket_size,
- GFP_KERNEL);
- ret = bch2_trans_relock_notrace(trans);
- if (ret)
- goto out;
- }
- }
-
- SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- alloc_data_type_set(&a->v, a->v.data_type);
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto out;
-commit:
- ret = bch2_trans_commit(trans, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto out;
-
- if (!fastpath)
- count_event(c, bucket_discard);
- else
- count_event(c, bucket_discard_fast);
-out:
-fsck_err:
- if (discard_locked)
- discard_in_flight_remove(ca, iter.pos.offset);
- if (!ret)
- s->seen++;
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static void bch2_do_discards_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
- struct bch_fs *c = ca->fs;
- struct discard_buckets_state s = {};
- struct bpos discard_pos_done = POS_MAX;
- int ret;
-
- /*
- * We're doing the commit in bch2_discard_one_bucket instead of using
- * for_each_btree_key_commit() so that we can increment counters after
- * successful commit:
- */
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter,
- BTREE_ID_need_discard,
- POS(ca->dev_idx, 0),
- POS(ca->dev_idx, U64_MAX), 0, k,
- bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
-
- if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
- bch2_journal_flush_async(&c->journal, NULL);
-
- trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
- bch2_err_str(ret));
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
-}
-
-void bch2_dev_do_discards(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard))
- return;
-
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards))
- goto put_write_ref;
-
- if (queue_work(c->write_ref_wq, &ca->discard_work))
- return;
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
-put_write_ref:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
-}
-
-void bch2_do_discards(struct bch_fs *c)
-{
- for_each_member_device(c, ca)
- bch2_dev_do_discards(ca);
-}
-
-static int bch2_do_discards_fast_one(struct btree_trans *trans,
- struct bch_dev *ca,
- u64 bucket,
- struct bpos *discard_pos_done,
- struct discard_buckets_state *s)
-{
- struct btree_iter need_discard_iter;
- struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
- BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
- int ret = bkey_err(discard_k);
- if (ret)
- return ret;
-
- if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set,
- trans, discarding_bucket_not_in_need_discard_btree,
- "attempting to discard bucket %u:%llu not in need_discard btree",
- ca->dev_idx, bucket))
- goto out;
-
- ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true);
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &need_discard_iter);
- return ret;
-}
-
-static void bch2_do_discards_fast_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
- struct bch_fs *c = ca->fs;
- struct discard_buckets_state s = {};
- struct bpos discard_pos_done = POS_MAX;
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
-
- while (1) {
- bool got_bucket = false;
- u64 bucket;
-
- mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i) {
- if (i->in_progress)
- continue;
-
- got_bucket = true;
- bucket = i->bucket;
- i->in_progress = true;
- break;
- }
- mutex_unlock(&ca->discard_buckets_in_flight_lock);
-
- if (!got_bucket)
- break;
-
- ret = lockrestart_do(trans,
- bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s));
- bch_err_fn(c, ret);
-
- discard_in_flight_remove(ca, bucket);
-
- if (ret)
- break;
- }
-
- trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
-
- bch2_trans_put(trans);
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
-}
-
-static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
-{
- struct bch_fs *c = ca->fs;
-
- if (discard_in_flight_add(ca, bucket, false))
- return;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast))
- return;
-
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast))
- goto put_ref;
-
- if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
- return;
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
-put_ref:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
-}
-
-static int invalidate_one_bp(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c_backpointer bp,
- struct bkey_buf *last_flushed)
-{
- struct btree_iter extent_iter;
- struct bkey_s_c extent_k =
- bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
- int ret = bkey_err(extent_k);
- if (ret)
- return ret;
-
- if (!extent_k.k)
- return 0;
-
- struct bkey_i *n =
- bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
- BTREE_UPDATE_internal_snapshot_node);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
-err:
- bch2_trans_iter_exit(trans, &extent_iter);
- return ret;
-}
-
-static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos bucket,
- u8 gen,
- struct bkey_buf *last_flushed)
-{
- struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket);
- struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket);
-
- return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
- bp_start, bp_end, 0, k,
- NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc, ({
- if (k.k->type != KEY_TYPE_backpointer)
- continue;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
- /* filter out bps with gens that don't match */
- if (bp.v->bucket_gen != gen)
- continue;
-
- invalidate_one_bp(trans, ca, bp, last_flushed);
- }));
-}
-
-noinline_for_stack
-static int invalidate_one_bucket(struct btree_trans *trans,
- struct bch_dev *ca,
- struct btree_iter *lru_iter,
- struct bkey_s_c lru_k,
- struct bkey_buf *last_flushed,
- s64 *nr_to_invalidate)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
- struct btree_iter alloc_iter = {};
- int ret = 0;
-
- if (*nr_to_invalidate <= 0)
- return 1;
-
- if (!bch2_dev_bucket_exists(c, bucket)) {
- if (fsck_err(trans, lru_entry_to_invalid_bucket,
- "lru key points to nonexistent device:bucket %llu:%llu",
- bucket.inode, bucket.offset))
- return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
- goto out;
- }
-
- if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
- return 0;
-
- struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
- BTREE_ID_alloc, bucket,
- BTREE_ITER_cached);
- ret = bkey_err(alloc_k);
- if (ret)
- return ret;
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
-
- /* We expect harmless races here due to the btree write buffer: */
- if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
- goto out;
-
- /*
- * Impossible since alloc_lru_idx_read() only returns nonzero if the
- * bucket is supposed to be on the cached bucket LRU (i.e.
- * BCH_DATA_cached)
- *
- * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
- */
- BUG_ON(a->data_type != BCH_DATA_cached);
- BUG_ON(a->dirty_sectors);
-
- if (!a->cached_sectors) {
- bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset,
- true, last_flushed);
- goto out;
- }
-
- unsigned cached_sectors = a->cached_sectors;
- u8 gen = a->gen;
-
- ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
- if (ret)
- goto out;
-
- trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
- --*nr_to_invalidate;
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &alloc_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
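-/*
- * Scan forward from the starting LRU position to the end of this device's LRU
- * keyspace; if nothing is found and we haven't wrapped yet, restart from
- * lru_pos(dev, 0, 0) so entries behind the starting position are also
- * considered.
- */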
-static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
- struct bch_dev *ca, bool *wrapped)
-{
- struct bkey_s_c k;
-again:
- k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
- if (!k.k && !*wrapped) {
- bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0));
- *wrapped = true;
- goto again;
- }
-
- return k;
-}
-
-static void bch2_do_invalidates_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
- struct bch_fs *c = ca->fs;
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- ret = bch2_btree_write_buffer_tryflush(trans);
- if (ret)
- goto err;
-
- s64 nr_to_invalidate =
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
- struct btree_iter iter;
- bool wrapped = false;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
- lru_pos(ca->dev_idx, 0,
- ((bch2_current_io_time(c, READ) + U32_MAX) &
- LRU_TIME_MAX)), 0);
-
- while (true) {
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
- ret = bkey_err(k);
- if (ret)
- goto restart_err;
- if (!k.k)
- break;
-
- ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
-restart_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- bch2_btree_iter_advance(trans, &iter);
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&last_flushed, c);
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
-}
-
-void bch2_dev_do_invalidates(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate))
- return;
-
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates))
- goto put_ref;
-
- if (queue_work(c->write_ref_wq, &ca->invalidate_work))
- return;
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
-put_ref:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
-}
-
-void bch2_do_invalidates(struct bch_fs *c)
-{
- for_each_member_device(c, ca)
- bch2_dev_do_invalidates(ca);
-}
-
-int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
- u64 bucket_start, u64 bucket_end)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey hole;
- struct bpos end = POS(ca->dev_idx, bucket_end);
- struct bch_member *m;
- unsigned long last_updated = jiffies;
- int ret;
-
- BUG_ON(bucket_start > bucket_end);
- BUG_ON(bucket_end > ca->mi.nbuckets);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
- BTREE_ITER_prefetch);
- /*
- * Scan the alloc btree for every bucket on @ca, and add buckets to the
- * freespace/need_discard/need_gc_gens btrees as needed:
- */
- while (1) {
- if (time_after(jiffies, last_updated + HZ * 10)) {
- bch_info(ca, "%s: currently at %llu/%llu",
- __func__, iter.pos.offset, ca->mi.nbuckets);
- last_updated = jiffies;
- }
-
- bch2_trans_begin(trans);
-
- if (bkey_ge(iter.pos, end)) {
- ret = 0;
- break;
- }
-
- k = bch2_get_key_or_hole(trans, &iter, end, &hole);
- ret = bkey_err(k);
- if (ret)
- goto bkey_err;
-
- if (k.k->type) {
- /*
- * We process live keys in the alloc btree one at a
- * time:
- */
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-
- ret = bch2_bucket_do_index(trans, ca, k, a, true) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_advance(trans, &iter);
- } else {
- struct bkey_i *freespace;
-
- freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
- ret = PTR_ERR_OR_ZERO(freespace);
- if (ret)
- goto bkey_err;
-
- bkey_init(&freespace->k);
- freespace->k.type = KEY_TYPE_set;
- freespace->k.p = k.k->p;
- freespace->k.size = k.k->size;
-
- ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_set_pos(trans, &iter, k.k->p);
- }
-bkey_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
-
- if (ret < 0) {
- bch_err_msg(ca, ret, "initializing free space");
- return ret;
- }
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-int bch2_fs_freespace_init(struct bch_fs *c)
-{
- if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image))
- return 0;
-
- /*
- * We can crash during the device add path, so we need to check this on
- * every mount:
- */
-
- bool doing_init = false;
- for_each_member_device(c, ca) {
- if (ca->mi.freespace_initialized)
- continue;
-
- if (!doing_init) {
- bch_info(c, "initializing freespace");
- doing_init = true;
- }
-
- int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
- if (ret) {
- bch2_dev_put(ca);
- bch_err_fn(c, ret);
- return ret;
- }
- }
-
- if (doing_init) {
- mutex_lock(&c->sb_lock);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- bch_verbose(c, "done initializing freespace");
- }
-
- return 0;
-}
-
-/* device removal */
-
-int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bpos start = POS(ca->dev_idx, 0);
- struct bpos end = POS(ca->dev_idx, U64_MAX);
- int ret;
-
- /*
- * We clear the LRU and need_discard btrees first so that we don't race
- * with bch2_do_invalidates() and bch2_do_discards()
- */
- ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_dev_usage_remove(c, ca->dev_idx);
- bch_err_msg(ca, ret, "removing dev alloc info");
- return ret;
-}
-
-/* Bucket IO clocks: */
-
-static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
-{
- struct bch_fs *c = trans->c;
-
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
- int ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ret;
-
- u64 now = bch2_current_io_time(c, rw);
- if (a->v.io_time[rw] == now)
- goto out;
-
- a->v.io_time[rw] = now;
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
-{
- if (bch2_trans_relock(trans))
- bch2_trans_begin(trans);
-
- return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
-}
-
-/* Startup/shutdown (ro/rw): */
-
-void bch2_recalc_capacity(struct bch_fs *c)
-{
- u64 capacity = 0, reserved_sectors = 0, gc_reserve;
- unsigned bucket_size_max = 0;
- unsigned long ra_pages = 0;
-
- lockdep_assert_held(&c->state_lock);
-
- guard(rcu)();
- for_each_member_device_rcu(c, ca, NULL) {
- struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
- if (bdev)
- ra_pages += bdev->bd_disk->bdi->ra_pages;
-
- if (ca->mi.state != BCH_MEMBER_STATE_rw)
- continue;
-
- u64 dev_reserve = 0;
-
- /*
- * We need to reserve buckets (from the number
- * of currently available buckets) against
- * foreground writes so that mainly copygc can
- * make forward progress.
- *
- * We need enough to refill the various reserves
- * reserve all at once, then run again when
- * reserve all at once, then run against when
- * its reserve is refilled (from the formerly
- * available buckets).
- *
- * This reserve is just used when considering if
- * allocations for foreground writes must wait -
- * not -ENOSPC calculations.
- */
-
- dev_reserve += ca->nr_btree_reserve * 2;
- dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
-
- dev_reserve += 1; /* btree write point */
- dev_reserve += 1; /* copygc write point */
- dev_reserve += 1; /* rebalance write point */
-
- dev_reserve *= ca->mi.bucket_size;
-
- capacity += bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket);
-
- reserved_sectors += dev_reserve * 2;
-
- bucket_size_max = max_t(unsigned, bucket_size_max,
- ca->mi.bucket_size);
- }
-
- bch2_set_ra_pages(c, ra_pages);
-
- gc_reserve = c->opts.gc_reserve_bytes
- ? c->opts.gc_reserve_bytes >> 9
- : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
-
- reserved_sectors = max(gc_reserve, reserved_sectors);
-
- reserved_sectors = min(reserved_sectors, capacity);
-
- c->reserved = reserved_sectors;
- c->capacity = capacity - reserved_sectors;
-
- c->bucket_size_max = bucket_size_max;
-
- /* Wake up in case someone was waiting for buckets */
- closure_wake_up(&c->freelist_wait);
-}
-
-u64 bch2_min_rw_member_capacity(struct bch_fs *c)
-{
- u64 ret = U64_MAX;
-
- guard(rcu)();
- for_each_rw_member_rcu(c, ca)
- ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
- return ret;
-}
-
-static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
-{
- struct open_bucket *ob;
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- scoped_guard(spinlock, &ob->lock) {
- if (ob->valid && !ob->on_partial_list &&
- ob->dev == ca->dev_idx)
- return true;
- }
- }
-
- return false;
-}
-
-void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw)
-{
- /* BCH_DATA_free == all rw devs */
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
- if (rw &&
- (i == BCH_DATA_free ||
- (ca->mi.data_allowed & BIT(i))))
- set_bit(ca->dev_idx, c->rw_devs[i].d);
- else
- clear_bit(ca->dev_idx, c->rw_devs[i].d);
-}
-
-/* device goes ro: */
-void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- /* First, remove device from allocation groups: */
- bch2_dev_allocator_set_rw(c, ca, false);
-
- c->rw_devs_change_count++;
-
- /*
- * Capacity is calculated based off of devices in allocation groups:
- */
- bch2_recalc_capacity(c);
-
- bch2_open_buckets_stop(c, ca, false);
-
- /*
- * Wake up threads that were blocked on allocation, so they can notice
- * the device is no longer available for allocation and the capacity has changed:
- */
- closure_wake_up(&c->freelist_wait);
-
- /*
- * journal_res_get() can block waiting for free space in the journal -
- * it needs to notice there may not be devices to allocate from anymore:
- */
- wake_up(&c->journal.wait);
-
- /* Now wait for any in flight writes: */
-
- closure_wait_event(&c->open_buckets_wait,
- !bch2_dev_has_open_write_point(c, ca));
-}
-
-/* device goes rw: */
-void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- bch2_dev_allocator_set_rw(c, ca, true);
- c->rw_devs_change_count++;
-}
-
-void bch2_dev_allocator_background_exit(struct bch_dev *ca)
-{
- darray_exit(&ca->discard_buckets_in_flight);
-}
-
-void bch2_dev_allocator_background_init(struct bch_dev *ca)
-{
- mutex_init(&ca->discard_buckets_in_flight_lock);
- INIT_WORK(&ca->discard_work, bch2_do_discards_work);
- INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
- INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
-}
-
-void bch2_fs_allocator_background_init(struct bch_fs *c)
-{
- spin_lock_init(&c->freelist_lock);
-}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
deleted file mode 100644
index 0cc5adc55b6f..000000000000
--- a/fs/bcachefs/alloc_background.h
+++ /dev/null
@@ -1,361 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
-#define _BCACHEFS_ALLOC_BACKGROUND_H
-
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "debug.h"
-#include "super.h"
-
-/* How out of date a pointer gen is allowed to be: */
-#define BUCKET_GC_GEN_MAX 96U
-
-static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
-{
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
- return ca && bucket_valid(ca, pos.offset);
-}
-
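-/*
- * Buckets are referenced in packed u64 form (e.g. in LRU key offsets): device
- * index in the top 16 bits, bucket offset in the low 48 bits - e.g. device 2,
- * bucket 1000 packs to (2ULL << 48) | 1000, and u64_to_bucket() recovers
- * POS(2, 1000).
- */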
-static inline u64 bucket_to_u64(struct bpos bucket)
-{
- return (bucket.inode << 48) | bucket.offset;
-}
-
-static inline struct bpos u64_to_bucket(u64 bucket)
-{
- return POS(bucket >> 48, bucket & ~(~0ULL << 48));
-}
-
-static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
-{
- return a.gen - a.oldest_gen;
-}
-
-static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
-{
- dst->gen = src.gen;
- dst->data_type = src.data_type;
- dst->stripe_sectors = src.stripe_sectors;
- dst->dirty_sectors = src.dirty_sectors;
- dst->cached_sectors = src.cached_sectors;
- dst->stripe = src.stripe;
-}
-
-static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src)
-{
- dst->gen = src.gen;
- dst->data_type = src.data_type;
- dst->stripe_sectors = src.stripe_sectors;
- dst->dirty_sectors = src.dirty_sectors;
- dst->cached_sectors = src.cached_sectors;
- dst->stripe = src.stripe;
-}
-
-static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
-{
- struct bch_alloc_v4 ret = {};
- __bucket_m_to_alloc(&ret, b);
- return ret;
-}
-
-static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
-{
- switch (data_type) {
- case BCH_DATA_cached:
- case BCH_DATA_stripe:
- return BCH_DATA_user;
- default:
- return data_type;
- }
-}
-
-static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
- enum bch_data_type ptr)
-{
- return !data_type_is_empty(bucket) &&
- bucket_data_type(bucket) != bucket_data_type(ptr);
-}
-
-/*
- * It is my general preference to use unsigned types for unsigned quantities -
- * however, these helpers are used in disk accounting calculations run by
- * triggers where the output will be negated and added to an s64. unsigned is
- * right out even though all these quantities will fit in 32 bits, since it
- * won't be sign extended correctly; u64 will negate "correctly", but s64 is the
- * simpler option here.
- */
-static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
-{
- return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
-}
-
-static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
-{
- return a.stripe_sectors + a.dirty_sectors;
-}
-
-static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
-{
- return a.data_type == BCH_DATA_cached
- ? a.cached_sectors
- : bch2_bucket_sectors_dirty(a);
-}
-
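-/*
- * "Fragmented" here counts the unused space in a nonempty bucket: e.g. a
- * 512 sector bucket holding 100 sectors contributes 412, while a completely
- * empty bucket contributes 0.
- */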
-static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
- struct bch_alloc_v4 a)
-{
- int d = bch2_bucket_sectors(a);
-
- return d ? max(0, ca->mi.bucket_size - d) : 0;
-}
-
-static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
-{
- int d = a.stripe_sectors + a.dirty_sectors;
-
- return d ? max(0, ca->mi.bucket_size - d) : 0;
-}
-
-static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
-{
- return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
-}
-
-static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
- enum bch_data_type data_type)
-{
- if (a.stripe)
- return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
- if (bch2_bucket_sectors_dirty(a))
- return bucket_data_type(data_type);
- if (a.cached_sectors)
- return BCH_DATA_cached;
- if (BCH_ALLOC_V4_NEED_DISCARD(&a))
- return BCH_DATA_need_discard;
- if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
- return BCH_DATA_need_gc_gens;
- return BCH_DATA_free;
-}
-
-static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type)
-{
- a->data_type = alloc_data_type(*a, data_type);
-}
-
-static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
-{
- return a.data_type == BCH_DATA_cached
- ? a.io_time[READ] & LRU_TIME_MAX
- : 0;
-}
-
-#define DATA_TYPES_MOVABLE \
- ((1U << BCH_DATA_btree)| \
- (1U << BCH_DATA_user)| \
- (1U << BCH_DATA_stripe))
-
-static inline bool data_type_movable(enum bch_data_type type)
-{
- return (1U << type) & DATA_TYPES_MOVABLE;
-}
-
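-/*
- * The fragmentation LRU index scales linearly with how full a movable bucket
- * is: e.g. 128 dirty sectors out of a 512 sector bucket gives
- * 128 * 2^31 / 512 = 2^29, and a completely full bucket gives 2^31.
- */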
-static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
- struct bch_dev *ca)
-{
- if (a.data_type >= BCH_DATA_NR)
- return 0;
-
- if (!data_type_movable(a.data_type) ||
- !bch2_bucket_sectors_fragmented(ca, a))
- return 0;
-
- /*
- * avoid overflowing LRU_TIME_BITS on a corrupted fs, when
- * bucket_sectors_dirty is (much) bigger than bucket_size
- */
- u64 d = min_t(s64, bch2_bucket_sectors_dirty(a),
- ca->mi.bucket_size);
-
- return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
-}
-
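-/*
- * Freespace keys encode alloc_gc_gen() / 16 in the top 8 bits of the key
- * offset, with the bucket number in the low 56 bits: e.g. bucket 1000 with
- * gc gen 35 becomes offset (2ULL << 56) | 1000, and the allocator strips the
- * genbits again with offset & ~(~0ULL << 56).
- */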
-static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
-{
- return ((u64) alloc_gc_gen(a) >> 4) << 56;
-}
-
-static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
-{
- pos.offset |= alloc_freespace_genbits(a);
- return pos;
-}
-
-static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a)
-{
- return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
- BCH_ALLOC_V4_U64s_V0) +
- BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
- (sizeof(struct bch_backpointer) / sizeof(u64));
-}
-
-static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
-{
- unsigned ret = alloc_v4_u64s_noerror(a);
- BUG_ON(ret > U8_MAX - BKEY_U64s);
- return ret;
-}
-
-static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
-{
- set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
-}
-
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
-struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *, struct bpos,
- enum btree_iter_update_trigger_flags);
-
-void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
-
-static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert)
-{
- const struct bch_alloc_v4 *ret;
-
- if (unlikely(k.k->type != KEY_TYPE_alloc_v4))
- goto slowpath;
-
- ret = bkey_s_c_to_alloc_v4(k).v;
- if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s)
- goto slowpath;
-
- return ret;
-slowpath:
- __bch2_alloc_to_v4(k, convert);
- return convert;
-}
-
-struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
-
-int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-
-int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_alloc_v4_swab(struct bkey_s);
-void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v1_validate, \
- .val_to_text = bch2_alloc_to_text, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 8, \
-})
-
-#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v2_validate, \
- .val_to_text = bch2_alloc_to_text, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 8, \
-})
-
-#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v3_validate, \
- .val_to_text = bch2_alloc_to_text, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 16, \
-})
-
-#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
- .key_validate = bch2_alloc_v4_validate, \
- .val_to_text = bch2_alloc_v4_to_text, \
- .swab = bch2_alloc_v4_swab, \
- .trigger = bch2_trigger_alloc, \
- .min_val_size = 48, \
-})
-
-int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
- .key_validate = bch2_bucket_gens_validate, \
- .val_to_text = bch2_bucket_gens_to_text, \
-})
-
-int bch2_bucket_gens_init(struct bch_fs *);
-
-static inline bool bkey_is_alloc(const struct bkey *k)
-{
- return k->type == KEY_TYPE_alloc ||
- k->type == KEY_TYPE_alloc_v2 ||
- k->type == KEY_TYPE_alloc_v3;
-}
-
-int bch2_alloc_read(struct bch_fs *);
-
-int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
- const struct bch_alloc_v4 *,
- const struct bch_alloc_v4 *, unsigned);
-int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool);
-int bch2_check_alloc_info(struct bch_fs *);
-int bch2_check_alloc_to_lru_refs(struct bch_fs *);
-void bch2_dev_do_discards(struct bch_dev *);
-void bch2_do_discards(struct bch_fs *);
-
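-/*
- * Aim to keep roughly 1/128th of the device's buckets free: e.g. with 2^20
- * buckets want_free is 8192, so if only 5000 buckets are free (counting
- * need_discard, minus the stripe reserve) this asks for 3192 cached buckets
- * to be invalidated, capped at the number of cached buckets available.
- */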
-static inline u64 should_invalidate_buckets(struct bch_dev *ca,
- struct bch_dev_usage u)
-{
- u64 want_free = ca->mi.nbuckets >> 7;
- u64 free = max_t(s64, 0,
- u.buckets[BCH_DATA_free]
- + u.buckets[BCH_DATA_need_discard]
- - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
-
- return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]);
-}
-
-void bch2_dev_do_invalidates(struct bch_dev *);
-void bch2_do_invalidates(struct bch_fs *);
-
-static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
-{
- return (void *) ((u64 *) &a->v +
- (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
- BCH_ALLOC_V4_U64s_V0));
-}
-
-static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
-{
- return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
-}
-
-int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
-int bch2_fs_freespace_init(struct bch_fs *);
-int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
-
-void bch2_recalc_capacity(struct bch_fs *);
-u64 bch2_min_rw_member_capacity(struct bch_fs *);
-
-void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool);
-void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-
-void bch2_dev_allocator_background_exit(struct bch_dev *);
-void bch2_dev_allocator_background_init(struct bch_dev *);
-
-void bch2_fs_allocator_background_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
deleted file mode 100644
index 740238369a5a..000000000000
--- a/fs/bcachefs/alloc_background_format.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
-#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
-
-struct bch_alloc {
- struct bch_val v;
- __u8 fields;
- __u8 gen;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1() \
- x(read_time, 16) \
- x(write_time, 16) \
- x(data_type, 8) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
- x(oldest_gen, 8) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bch_alloc_v2 {
- struct bch_val v;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2() \
- x(read_time, 64) \
- x(write_time, 64) \
- x(dirty_sectors, 32) \
- x(cached_sectors, 32) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-struct bch_alloc_v3 {
- struct bch_val v;
- __le64 journal_seq;
- __le32 flags;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
-
-struct bch_alloc_v4 {
- struct bch_val v;
- __u64 journal_seq_nonempty;
- __u32 flags;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 stripe_redundancy;
- __u32 dirty_sectors;
- __u32 cached_sectors;
- __u64 io_time[2];
- __u32 stripe;
- __u32 nr_external_backpointers;
- /* end of fields in original version of alloc_v4 */
- __u64 journal_seq_empty;
- __u32 stripe_sectors;
- __u32 pad;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0 6
-#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
-
-#define KEY_TYPE_BUCKET_GENS_BITS 8
-#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
-
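-/*
- * Each bucket_gens key holds generation numbers for KEY_TYPE_BUCKET_GENS_NR
- * (256) consecutive buckets: bucket b is stored at
- * gens[b & KEY_TYPE_BUCKET_GENS_MASK] in the key covering
- * b >> KEY_TYPE_BUCKET_GENS_BITS - e.g. bucket 1000 is slot 232 of key 3.
- */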
-struct bch_bucket_gens {
- struct bch_val v;
- u8 gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
deleted file mode 100644
index b58525ec7b4d..000000000000
--- a/fs/bcachefs/alloc_foreground.c
+++ /dev/null
@@ -1,1683 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2012 Google, Inc.
- *
- * Foreground allocator code: allocate buckets from freelist, and allocate in
- * sector granularity from writepoints.
- *
- * bch2_bucket_alloc() allocates a single bucket from a specific device.
- *
- * bch2_bucket_alloc_set() allocates one or more buckets from different devices
- * in a given filesystem.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "io_write.h"
-#include "journal.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "trace.h"
-
-#include <linux/math64.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-
-static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
- struct mutex *lock)
-{
- if (!mutex_trylock(lock)) {
- bch2_trans_unlock(trans);
- mutex_lock(lock);
- }
-}
-
-const char * const bch2_watermarks[] = {
-#define x(t) #t,
- BCH_WATERMARKS()
-#undef x
- NULL
-};
-
-/*
- * Open buckets represent a bucket that's currently being allocated from. They
- * serve two purposes:
- *
- * - They track buckets that have been partially allocated, allowing for
- * sub-bucket sized allocations - they're used by the sector allocator below
- *
- * - They provide a reference to the buckets they own that mark and sweep GC
- * can find, until the new allocation has a pointer to it inserted into the
- * btree
- *
- * When allocating some space with the sector allocator, the allocation comes
- * with a reference to an open bucket - the caller is required to put that
- * reference _after_ doing the index update that makes its allocation reachable.
- */
-
-void bch2_reset_alloc_cursors(struct bch_fs *c)
-{
- guard(rcu)();
- for_each_member_device_rcu(c, ca, NULL)
- memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
-}
-
-static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
-{
- open_bucket_idx_t idx = ob - c->open_buckets;
- open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
-
- ob->hash = *slot;
- *slot = idx;
-}
-
-static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
-{
- open_bucket_idx_t idx = ob - c->open_buckets;
- open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
-
- while (*slot != idx) {
- BUG_ON(!*slot);
- slot = &c->open_buckets[*slot].hash;
- }
-
- *slot = ob->hash;
- ob->hash = 0;
-}
-
-void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- if (ob->ec) {
- ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
- return;
- }
-
- spin_lock(&ob->lock);
- ob->valid = false;
- ob->data_type = 0;
- spin_unlock(&ob->lock);
-
- spin_lock(&c->freelist_lock);
- bch2_open_bucket_hash_remove(c, ob);
-
- ob->freelist = c->open_buckets_freelist;
- c->open_buckets_freelist = ob - c->open_buckets;
-
- c->open_buckets_nr_free++;
- ca->nr_open_buckets--;
- spin_unlock(&c->freelist_lock);
-
- closure_wake_up(&c->open_buckets_wait);
-}
-
-void bch2_open_bucket_write_error(struct bch_fs *c,
- struct open_buckets *obs,
- unsigned dev, int err)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, obs, ob, i)
- if (ob->dev == dev && ob->ec)
- bch2_ec_bucket_cancel(c, ob, err);
-}
-
-static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
-{
- struct open_bucket *ob;
-
- BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
-
- ob = c->open_buckets + c->open_buckets_freelist;
- c->open_buckets_freelist = ob->freelist;
- atomic_set(&ob->pin, 1);
- ob->data_type = 0;
-
- c->open_buckets_nr_free--;
- return ob;
-}
-
-static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs))
- return false;
-
- return bch2_is_superblock_bucket(ca, b);
-}
-
-static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
-{
- BUG_ON(c->open_buckets_partial_nr >=
- ARRAY_SIZE(c->open_buckets_partial));
-
- spin_lock(&c->freelist_lock);
- scoped_guard(rcu)
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
-
- ob->on_partial_list = true;
- c->open_buckets_partial[c->open_buckets_partial_nr++] =
- ob - c->open_buckets;
- spin_unlock(&c->freelist_lock);
-
- closure_wake_up(&c->open_buckets_wait);
- closure_wake_up(&c->freelist_wait);
-}
-
-static inline bool may_alloc_bucket(struct bch_fs *c,
- struct alloc_request *req,
- struct bpos bucket)
-{
- if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
- req->counters.skipped_open++;
- return false;
- }
-
- u64 journal_seq_ready =
- bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
- bucket.inode, bucket.offset);
- if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
- if (journal_seq_ready > c->journal.flushing_seq)
- req->counters.need_journal_commit++;
- req->counters.skipped_need_journal_commit++;
- return false;
- }
-
- if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
- req->counters.skipped_nocow++;
- return false;
- }
-
- return true;
-}
-
-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
- struct alloc_request *req,
- u64 bucket, u8 gen,
- struct closure *cl)
-{
- struct bch_dev *ca = req->ca;
-
- if (unlikely(is_superblock_bucket(c, ca, bucket)))
- return NULL;
-
- if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
- req->counters.skipped_nouse++;
- return NULL;
- }
-
- spin_lock(&c->freelist_lock);
-
- if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
- if (cl)
- closure_wait(&c->open_buckets_wait, cl);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
- spin_unlock(&c->freelist_lock);
- return ERR_PTR(bch_err_throw(c, open_buckets_empty));
- }
-
- /* Recheck under lock: */
- if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
- spin_unlock(&c->freelist_lock);
- req->counters.skipped_open++;
- return NULL;
- }
-
- struct open_bucket *ob = bch2_open_bucket_alloc(c);
-
- spin_lock(&ob->lock);
- ob->valid = true;
- ob->sectors_free = ca->mi.bucket_size;
- ob->dev = ca->dev_idx;
- ob->gen = gen;
- ob->bucket = bucket;
- spin_unlock(&ob->lock);
-
- ca->nr_open_buckets++;
- bch2_open_bucket_hash_add(c, ob);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
- track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
-
- spin_unlock(&c->freelist_lock);
- return ob;
-}
-
-static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
- struct alloc_request *req,
- struct btree_iter *freespace_iter,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
-
- if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b)))
- return NULL;
-
- u8 gen;
- int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret)
- return NULL;
-
- return __try_alloc_bucket(c, req, b, gen, cl);
-}
-
-/*
- * This path is for before the freespace btree is initialized:
- */
-static noinline struct open_bucket *
-bch2_bucket_alloc_early(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = req->ca;
- struct btree_iter iter, citer;
- struct bkey_s_c k, ck;
- struct open_bucket *ob = NULL;
- u64 first_bucket = ca->mi.first_bucket;
- u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
- u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
- u64 alloc_cursor = alloc_start;
- int ret;
-
- /*
- * Scan with an uncached iterator to avoid polluting the key cache. An
- * uncached iter will return a cached key if one exists, but if not
- * there is no other underlying protection for the associated key cache
- * slot. To avoid racing bucket allocations, look up the cached key slot
- * of any likely allocation candidate before attempting to proceed with
- * the allocation. This provides proper exclusion on the associated
- * bucket.
- */
-again:
- for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
- BTREE_ITER_slots, k, ret) {
- u64 bucket = k.k->p.offset;
-
- if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
- break;
-
- if (req->btree_bitmap != BTREE_BITMAP_ANY &&
- req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
- bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
- if (req->btree_bitmap == BTREE_BITMAP_YES &&
- bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
- break;
-
- bucket = sector_to_bucket(ca,
- round_up(bucket_to_sector(ca, bucket) + 1,
- 1ULL << ca->mi.btree_bitmap_shift));
- bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
- req->counters.buckets_seen++;
- req->counters.skipped_mi_btree_bitmap++;
- continue;
- }
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- if (a->data_type != BCH_DATA_free)
- continue;
-
- /* now check the cached key to serialize concurrent allocs of the bucket */
- ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
- ret = bkey_err(ck);
- if (ret)
- break;
-
- a = bch2_alloc_to_v4(ck, &a_convert);
- if (a->data_type != BCH_DATA_free)
- goto next;
-
- req->counters.buckets_seen++;
-
- ob = may_alloc_bucket(c, req, k.k->p)
- ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
- : NULL;
-next:
- bch2_set_btree_iter_dontneed(trans, &citer);
- bch2_trans_iter_exit(trans, &citer);
- if (ob)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- alloc_cursor = iter.pos.offset;
-
- if (!ob && ret)
- ob = ERR_PTR(ret);
-
- if (!ob && alloc_start > first_bucket) {
- alloc_cursor = alloc_start = first_bucket;
- goto again;
- }
-
- *dev_alloc_cursor = alloc_cursor;
-
- return ob;
-}
-
-static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
-{
- struct bch_dev *ca = req->ca;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct open_bucket *ob = NULL;
- u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
- u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
- u64 alloc_cursor = alloc_start;
- int ret;
-again:
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
- POS(ca->dev_idx, alloc_cursor),
- POS(ca->dev_idx, U64_MAX),
- 0, k, ret) {
- /*
- * peek normally doesn't trim extents - they can span iter.pos,
- * which is not what we want here:
- */
- iter.k.size = iter.k.p.offset - iter.pos.offset;
-
- while (iter.k.size) {
- req->counters.buckets_seen++;
-
- u64 bucket = iter.pos.offset & ~(~0ULL << 56);
- if (req->btree_bitmap != BTREE_BITMAP_ANY &&
- req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
- bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
- if (req->btree_bitmap == BTREE_BITMAP_YES &&
- bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
- goto fail;
-
- bucket = sector_to_bucket(ca,
- round_up(bucket_to_sector(ca, bucket + 1),
- 1ULL << ca->mi.btree_bitmap_shift));
- alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
-
- bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
- req->counters.skipped_mi_btree_bitmap++;
- goto next;
- }
-
- ob = try_alloc_bucket(trans, req, &iter, cl);
- if (ob) {
- if (!IS_ERR(ob))
- *dev_alloc_cursor = iter.pos.offset;
- bch2_set_btree_iter_dontneed(trans, &iter);
- break;
- }
-
- iter.k.size--;
- iter.pos.offset++;
- }
-next:
- if (ob || ret)
- break;
- }
-fail:
- bch2_trans_iter_exit(trans, &iter);
-
- BUG_ON(ob && ret);
-
- if (ret)
- ob = ERR_PTR(ret);
-
- if (!ob && alloc_start > ca->mi.first_bucket) {
- alloc_cursor = alloc_start = ca->mi.first_bucket;
- goto again;
- }
-
- return ob;
-}
-
-static noinline void trace_bucket_alloc2(struct bch_fs *c,
- struct alloc_request *req,
- struct closure *cl,
- struct open_bucket *ob)
-{
- struct printbuf buf = PRINTBUF;
-
- printbuf_tabstop_push(&buf, 24);
-
- prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
- prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
- prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
- prt_printf(&buf, "blocking\t%u\n", cl != NULL);
- prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
- prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
- prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
- bch2_copygc_wait_amount(c),
- c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
- prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen);
- prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open);
- prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit);
- prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow);
- prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse);
- prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap);
-
- if (!IS_ERR(ob)) {
- prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
- trace_bucket_alloc(c, buf.buf);
- } else {
- prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
- trace_bucket_alloc_fail(c, buf.buf);
- }
-
- printbuf_exit(&buf);
-}
-
-/**
- * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
- * @trans: transaction object
- * @req: state for the entire allocation
- * @cl: if not NULL, closure to be used to wait if buckets not available
- * @nowait: if true, do not wait for buckets to become available
- *
- * Returns: an open_bucket on success, or an ERR_PTR() on failure.
- */
-static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl,
- bool nowait)
-{
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = req->ca;
- struct open_bucket *ob = NULL;
- bool freespace = READ_ONCE(ca->mi.freespace_initialized);
- u64 avail;
- bool waiting = nowait;
-
- req->btree_bitmap = req->data_type == BCH_DATA_btree;
- memset(&req->counters, 0, sizeof(req->counters));
-again:
- bch2_dev_usage_read_fast(ca, &req->usage);
- avail = dev_buckets_free(ca, req->usage, req->watermark);
-
- if (req->usage.buckets[BCH_DATA_need_discard] >
- min(avail, ca->mi.nbuckets >> 7))
- bch2_dev_do_discards(ca);
-
- if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail)
- bch2_gc_gens_async(c);
-
- if (should_invalidate_buckets(ca, req->usage))
- bch2_dev_do_invalidates(ca);
-
- if (!avail) {
- if (req->watermark > BCH_WATERMARK_normal &&
- c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
- goto alloc;
-
- if (cl && !waiting) {
- closure_wait(&c->freelist_wait, cl);
- waiting = true;
- goto again;
- }
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
-
- ob = ERR_PTR(bch_err_throw(c, freelist_empty));
- goto err;
- }
-
- if (waiting)
- closure_wake_up(&c->freelist_wait);
-alloc:
- ob = likely(freespace)
- ? bch2_bucket_alloc_freelist(trans, req, cl)
- : bch2_bucket_alloc_early(trans, req, cl);
-
- if (req->counters.need_journal_commit * 2 > avail)
- bch2_journal_flush_async(&c->journal, NULL);
-
- if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) {
- req->btree_bitmap = BTREE_BITMAP_ANY;
- goto alloc;
- }
-
- if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) {
- freespace = false;
- goto alloc;
- }
-err:
- if (!ob)
- ob = ERR_PTR(bch_err_throw(c, no_buckets_found));
-
- if (!IS_ERR(ob))
- ob->data_type = req->data_type;
-
- if (!IS_ERR(ob))
- count_event(c, bucket_alloc);
- else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
- count_event(c, bucket_alloc_fail);
-
- if (!IS_ERR(ob)
- ? trace_bucket_alloc_enabled()
- : trace_bucket_alloc_fail_enabled())
- trace_bucket_alloc2(c, req, cl, ob);
-
- return ob;
-}
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
- enum bch_watermark watermark,
- enum bch_data_type data_type,
- struct closure *cl)
-{
- struct open_bucket *ob;
- struct alloc_request req = {
- .watermark = watermark,
- .data_type = data_type,
- .ca = ca,
- };
-
- bch2_trans_do(c,
- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
- return ob;
-}
-
-static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
- unsigned l, unsigned r)
-{
- return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
-}
-
-#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
-
-void bch2_dev_alloc_list(struct bch_fs *c,
- struct dev_stripe_state *stripe,
- struct bch_devs_mask *devs,
- struct dev_alloc_list *ret)
-{
- ret->nr = 0;
-
- unsigned i;
- for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
- ret->data[ret->nr++] = i;
-
- bubble_sort(ret->data, ret->nr, dev_stripe_cmp);
-}
-
-static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */
-static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */
-static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */
-
-static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
-{
- /*
-	 * Avoid underflowing clock hands if at all possible: if a clock hand
-	 * goes to 0 we lose information. Clock hands can span a wide range when
-	 * there are devices we rarely allocate from - e.g. if we generally
-	 * allocate from a specified target and only sometimes fall back to the
-	 * whole filesystem.
- */
- u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */
-	u64 scale_min = 0;		/* minimum we must subtract to avoid overflow */
-
- for (u64 *v = stripe->next_alloc;
- v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
- if (*v)
- scale_max = min(scale_max, *v);
- if (*v > stripe_clock_hand_max)
- scale_min = max(scale_min, *v - stripe_clock_hand_max);
- }
-
- u64 scale = max(scale_min, scale_max);
-
- for (u64 *v = stripe->next_alloc;
- v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
- *v = *v < scale ? 0 : *v - scale;
-}
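A minimal standalone sketch of the same rescale rule (hypothetical clock-hand values, standard C only, not the bcachefs code itself): the largest safe amount is subtracted from every hand so their relative ordering is preserved.

    #include <stdint.h>
    #include <stdio.h>

    #define HAND_MAX (1ULL << 56)		/* mirrors stripe_clock_hand_max */

    int main(void)
    {
    	uint64_t hand[3] = { 1ULL << 62, 1ULL << 61, 0 };	/* hypothetical hands */
    	uint64_t scale_max = UINT64_MAX, scale_min = 0;

    	for (int i = 0; i < 3; i++) {
    		if (hand[i] && hand[i] < scale_max)
    			scale_max = hand[i];		/* max we can subtract without underflow */
    		if (hand[i] > HAND_MAX && hand[i] - HAND_MAX > scale_min)
    			scale_min = hand[i] - HAND_MAX;	/* min we must subtract */
    	}

    	uint64_t scale = scale_min > scale_max ? scale_min : scale_max;

    	for (int i = 0; i < 3; i++) {
    		hand[i] = hand[i] < scale ? 0 : hand[i] - scale;
    		printf("hand %d -> %llu\n", i, (unsigned long long) hand[i]);
    	}
    	return 0;
    }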
-
-static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
- struct dev_stripe_state *stripe,
- struct bch_dev_usage *usage)
-{
- /*
- * Stripe state has a per device clock hand: we allocate from the device
- * with the smallest clock hand.
- *
- * When we allocate, we don't do a simple increment; we add the inverse
- * of the device's free space. This results in round robin behavior that
- * biases in favor of the device(s) with more free space.
- */
-
- u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
- u64 free_space_inv = free_space
- ? div64_u64(stripe_clock_hand_inv, free_space)
- : stripe_clock_hand_inv;
-
- /* Saturating add, avoid overflow: */
- u64 sum = *v + free_space_inv;
- *v = sum >= *v ? sum : U64_MAX;
-
- if (unlikely(*v > stripe_clock_hand_rescale))
- bch2_stripe_state_rescale(stripe);
-}
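Roughly how the weighting plays out (a toy illustration with hypothetical numbers, not the kernel code): the device with more free space receives a smaller per-allocation increment, so its hand stays lowest more often and it wins a proportionally larger share of allocations.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint64_t free_space[2] = { 1000, 2000 };	/* dev 1 has twice the free space */
    	uint64_t hand[2] = { 0, 0 };
    	unsigned picks[2] = { 0, 0 };

    	for (int i = 0; i < 300; i++) {
    		int dev = hand[0] <= hand[1] ? 0 : 1;	/* smallest clock hand wins */
    		picks[dev]++;
    		hand[dev] += (1ULL << 20) / free_space[dev];
    	}

    	/* prints roughly 100 vs 200: allocations track free space */
    	printf("dev0: %u picks, dev1: %u picks\n", picks[0], picks[1]);
    	return 0;
    }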
-
-void bch2_dev_stripe_increment(struct bch_dev *ca,
- struct dev_stripe_state *stripe)
-{
- struct bch_dev_usage usage;
-
- bch2_dev_usage_read_fast(ca, &usage);
- bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
-}
-
-static int add_new_bucket(struct bch_fs *c,
- struct alloc_request *req,
- struct open_bucket *ob)
-{
- unsigned durability = ob_dev(c, ob)->mi.durability;
-
- BUG_ON(req->nr_effective >= req->nr_replicas);
-
- __clear_bit(ob->dev, req->devs_may_alloc.d);
- req->nr_effective += durability;
- req->have_cache |= !durability;
-
- ob_push(c, &req->ptrs, ob);
-
- if (req->nr_effective >= req->nr_replicas)
- return 1;
- if (ob->ec)
- return 1;
- return 0;
-}
-
-inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
- struct alloc_request *req,
- struct dev_stripe_state *stripe,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- BUG_ON(req->nr_effective >= req->nr_replicas);
-
- bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
-
- darray_for_each(req->devs_sorted, i) {
- req->ca = bch2_dev_tryget_noerror(c, *i);
- if (!req->ca)
- continue;
-
- if (!req->ca->mi.durability && req->have_cache) {
- bch2_dev_put(req->ca);
- continue;
- }
-
- struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
- req->flags & BCH_WRITE_alloc_nowait);
- if (!IS_ERR(ob))
- bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
- bch2_dev_put(req->ca);
-
- if (IS_ERR(ob)) {
- ret = PTR_ERR(ob);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
- break;
- continue;
- }
-
- ret = add_new_bucket(c, req, ob);
- if (ret)
- break;
- }
-
- if (ret == 1)
- return 0;
- if (ret)
- return ret;
- return bch_err_throw(c, insufficient_devices);
-}
-
-/* Allocate from stripes: */
-
-/*
- * if we can't allocate a new stripe because there are already too many
- * partially filled stripes, force allocating from an existing stripe even when
- * it's to a device we don't want:
- */
-
-static int bucket_alloc_from_stripe(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- if (req->nr_replicas < 2)
- return 0;
-
- if (ec_open_bucket(c, &req->ptrs))
- return 0;
-
- struct ec_stripe_head *h =
- bch2_ec_stripe_head_get(trans, req, 0, cl);
- if (IS_ERR(h))
- return PTR_ERR(h);
- if (!h)
- return 0;
-
- bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted);
-
- darray_for_each(req->devs_sorted, i)
- for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
- if (!h->s->blocks[ec_idx])
- continue;
-
- struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
- if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
- ob->ec_idx = ec_idx;
- ob->ec = h->s;
- ec_stripe_new_get(h->s, STRIPE_REF_io);
-
- ret = add_new_bucket(c, req, ob);
- goto out;
- }
- }
-out:
- bch2_ec_stripe_head_put(c, h);
- return ret;
-}
-
-/* Sector allocator */
-
-static bool want_bucket(struct bch_fs *c,
- struct alloc_request *req,
- struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- if (!test_bit(ob->dev, req->devs_may_alloc.d))
- return false;
-
- if (ob->data_type != req->wp->data_type)
- return false;
-
- if (!ca->mi.durability &&
- (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache))
- return false;
-
- if (req->ec != (ob->ec != NULL))
- return false;
-
- return true;
-}
-
-static int bucket_alloc_set_writepoint(struct bch_fs *c,
- struct alloc_request *req)
-{
- struct open_bucket *ob;
- unsigned i;
- int ret = 0;
-
- req->scratch_ptrs.nr = 0;
-
- open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
- if (!ret && want_bucket(c, req, ob))
- ret = add_new_bucket(c, req, ob);
- else
- ob_push(c, &req->scratch_ptrs, ob);
- }
- req->wp->ptrs = req->scratch_ptrs;
-
- return ret;
-}
-
-static int bucket_alloc_set_partial(struct bch_fs *c,
- struct alloc_request *req)
-{
- int i, ret = 0;
-
- if (!c->open_buckets_partial_nr)
- return 0;
-
- spin_lock(&c->freelist_lock);
-
- if (!c->open_buckets_partial_nr)
- goto unlock;
-
- for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
- struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
-
- if (want_bucket(c, req, ob)) {
- struct bch_dev *ca = ob_dev(c, ob);
- u64 avail;
-
- bch2_dev_usage_read_fast(ca, &req->usage);
- avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets;
- if (!avail)
- continue;
-
- array_remove_item(c->open_buckets_partial,
- c->open_buckets_partial_nr,
- i);
- ob->on_partial_list = false;
-
- scoped_guard(rcu)
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
-
- ret = add_new_bucket(c, req, ob);
- if (ret)
- break;
- }
- }
-unlock:
- spin_unlock(&c->freelist_lock);
- return ret;
-}
-
-static int __open_bucket_add_buckets(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *_cl)
-{
- struct bch_fs *c = trans->c;
- struct open_bucket *ob;
- struct closure *cl = NULL;
- unsigned i;
- int ret;
-
- req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
-
- /* Don't allocate from devices we already have pointers to: */
- darray_for_each(*req->devs_have, i)
- __clear_bit(*i, req->devs_may_alloc.d);
-
- open_bucket_for_each(c, &req->ptrs, ob, i)
- __clear_bit(ob->dev, req->devs_may_alloc.d);
-
- ret = bucket_alloc_set_writepoint(c, req);
- if (ret)
- return ret;
-
- ret = bucket_alloc_set_partial(c, req);
- if (ret)
- return ret;
-
- if (req->ec) {
- ret = bucket_alloc_from_stripe(trans, req, _cl);
- } else {
-retry_blocking:
- /*
- * Try nonblocking first, so that if one device is full we'll try from
- * other devices:
- */
- ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
- !cl && _cl) {
- cl = _cl;
- goto retry_blocking;
- }
- }
-
- return ret;
-}
-
-static int open_bucket_add_buckets(struct btree_trans *trans,
- struct alloc_request *req,
- struct closure *cl)
-{
- int ret;
-
- if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
- ret = __open_bucket_add_buckets(trans, req, cl);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
- bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
- bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
- return ret;
- if (req->nr_effective >= req->nr_replicas)
- return 0;
- }
-
- bool ec = false;
- swap(ec, req->ec);
- ret = __open_bucket_add_buckets(trans, req, cl);
- swap(ec, req->ec);
-
- return ret < 0 ? ret : 0;
-}
-
-/**
- * should_drop_bucket - check if this open_bucket should go away
- * @ob:	open_bucket to predicate on
- * @c:	filesystem handle
- * @ca:	if set, we're killing buckets for a particular device
- * @ec:	if true, we're shutting down erasure coding and killing all ec
- *	open_buckets; if neither @ca nor @ec is given, every open_bucket matches
- * Returns: true if we should kill this open_bucket
- *
- * We're killing open_buckets because we're shutting down a device, erasure
- * coding, or the entire filesystem - check if this open_bucket matches:
- */
-static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
- struct bch_dev *ca, bool ec)
-{
- if (ec) {
- return ob->ec != NULL;
- } else if (ca) {
- bool drop = ob->dev == ca->dev_idx;
- struct open_bucket *ob2;
- unsigned i;
-
- if (!drop && ob->ec) {
- unsigned nr_blocks;
-
- mutex_lock(&ob->ec->lock);
- nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
-
- for (i = 0; i < nr_blocks; i++) {
- if (!ob->ec->blocks[i])
- continue;
-
- ob2 = c->open_buckets + ob->ec->blocks[i];
- drop |= ob2->dev == ca->dev_idx;
- }
- mutex_unlock(&ob->ec->lock);
- }
-
- return drop;
- } else {
- return true;
- }
-}
-
-static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
- bool ec, struct write_point *wp)
-{
- struct open_buckets ptrs = { .nr = 0 };
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&wp->lock);
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- if (should_drop_bucket(ob, c, ca, ec))
- bch2_open_bucket_put(c, ob);
- else
- ob_push(c, &ptrs, ob);
- wp->ptrs = ptrs;
- mutex_unlock(&wp->lock);
-}
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
- bool ec)
-{
- unsigned i;
-
- /* Next, close write points that point to this device... */
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
-
- bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
- bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
- bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- while (c->btree_reserve_cache_nr) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- bch2_open_buckets_put(c, &a->ob);
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- spin_lock(&c->freelist_lock);
- i = 0;
- while (i < c->open_buckets_partial_nr) {
- struct open_bucket *ob =
- c->open_buckets + c->open_buckets_partial[i];
-
- if (should_drop_bucket(ob, c, ca, ec)) {
- --c->open_buckets_partial_nr;
- swap(c->open_buckets_partial[i],
- c->open_buckets_partial[c->open_buckets_partial_nr]);
-
- ob->on_partial_list = false;
-
- scoped_guard(rcu)
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
-
- spin_unlock(&c->freelist_lock);
- bch2_open_bucket_put(c, ob);
- spin_lock(&c->freelist_lock);
- } else {
- i++;
- }
- }
- spin_unlock(&c->freelist_lock);
-
- bch2_ec_stop_dev(c, ca);
-}
-
-static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
- unsigned long write_point)
-{
- unsigned hash =
- hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
-
- return &c->write_points_hash[hash];
-}
-
-static struct write_point *__writepoint_find(struct hlist_head *head,
- unsigned long write_point)
-{
- struct write_point *wp;
-
- guard(rcu)();
- hlist_for_each_entry_rcu(wp, head, node)
- if (wp->write_point == write_point)
- return wp;
- return NULL;
-}
-
-static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
-{
- u64 stranded = c->write_points_nr * c->bucket_size_max;
- u64 free = bch2_fs_usage_read_short(c).free;
-
- return stranded * factor > free;
-}
-
-static noinline bool try_increase_writepoints(struct bch_fs *c)
-{
- struct write_point *wp;
-
- if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
- too_many_writepoints(c, 32))
- return false;
-
- wp = c->write_points + c->write_points_nr++;
- hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
- return true;
-}
-
-static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
-{
- struct bch_fs *c = trans->c;
- struct write_point *wp;
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&c->write_points_hash_lock);
- if (c->write_points_nr < old_nr) {
- mutex_unlock(&c->write_points_hash_lock);
- return true;
- }
-
- if (c->write_points_nr == 1 ||
- !too_many_writepoints(c, 8)) {
- mutex_unlock(&c->write_points_hash_lock);
- return false;
- }
-
- wp = c->write_points + --c->write_points_nr;
-
- hlist_del_rcu(&wp->node);
- mutex_unlock(&c->write_points_hash_lock);
-
- bch2_trans_mutex_lock_norelock(trans, &wp->lock);
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- open_bucket_free_unused(c, ob);
- wp->ptrs.nr = 0;
- mutex_unlock(&wp->lock);
- return true;
-}
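To put numbers on the stranded-space heuristic above (purely hypothetical figures): with 32 write points and a maximum bucket size of 2048 sectors, up to 32 * 2048 = 65536 sectors can sit stranded in partially filled buckets. try_decrease_writepoints() uses factor 8, so write points start being reclaimed once free space falls below 8 * 65536 = 524288 sectors, while try_increase_writepoints() uses factor 32 and only adds write points while free space is at least 32 * 65536 = 2097152 sectors.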
-
-static struct write_point *writepoint_find(struct btree_trans *trans,
- unsigned long write_point)
-{
- struct bch_fs *c = trans->c;
- struct write_point *wp, *oldest;
- struct hlist_head *head;
-
- if (!(write_point & 1UL)) {
- wp = (struct write_point *) write_point;
- bch2_trans_mutex_lock_norelock(trans, &wp->lock);
- return wp;
- }
-
- head = writepoint_hash(c, write_point);
-restart_find:
- wp = __writepoint_find(head, write_point);
- if (wp) {
-lock_wp:
- bch2_trans_mutex_lock_norelock(trans, &wp->lock);
- if (wp->write_point == write_point)
- goto out;
- mutex_unlock(&wp->lock);
- goto restart_find;
- }
-restart_find_oldest:
- oldest = NULL;
- for (wp = c->write_points;
- wp < c->write_points + c->write_points_nr; wp++)
- if (!oldest || time_before64(wp->last_used, oldest->last_used))
- oldest = wp;
-
- bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
- bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
- if (oldest >= c->write_points + c->write_points_nr ||
- try_increase_writepoints(c)) {
- mutex_unlock(&c->write_points_hash_lock);
- mutex_unlock(&oldest->lock);
- goto restart_find_oldest;
- }
-
- wp = __writepoint_find(head, write_point);
- if (wp && wp != oldest) {
- mutex_unlock(&c->write_points_hash_lock);
- mutex_unlock(&oldest->lock);
- goto lock_wp;
- }
-
- wp = oldest;
- hlist_del_rcu(&wp->node);
- wp->write_point = write_point;
- hlist_add_head_rcu(&wp->node, head);
- mutex_unlock(&c->write_points_hash_lock);
-out:
- wp->last_used = local_clock();
- return wp;
-}
-
-static noinline void
-deallocate_extra_replicas(struct bch_fs *c,
- struct alloc_request *req)
-{
- struct open_bucket *ob;
- unsigned extra_replicas = req->nr_effective - req->nr_replicas;
- unsigned i;
-
- req->scratch_ptrs.nr = 0;
-
- open_bucket_for_each(c, &req->ptrs, ob, i) {
- unsigned d = ob_dev(c, ob)->mi.durability;
-
- if (d && d <= extra_replicas) {
- extra_replicas -= d;
- ob_push(c, &req->wp->ptrs, ob);
- } else {
- ob_push(c, &req->scratch_ptrs, ob);
- }
- }
-
- req->ptrs = req->scratch_ptrs;
-}
-
-/*
- * Get us an open_bucket we can allocate from, return with it locked:
- */
-int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
- unsigned target,
- unsigned erasure_code,
- struct write_point_specifier write_point,
- struct bch_devs_list *devs_have,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *cl,
- struct write_point **wp_ret)
-{
- struct bch_fs *c = trans->c;
- struct open_bucket *ob;
- unsigned write_points_nr;
- int i;
-
- struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req));
- int ret = PTR_ERR_OR_ZERO(req);
- if (unlikely(ret))
- return ret;
-
- if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
- erasure_code = false;
-
- req->nr_replicas = nr_replicas;
- req->target = target;
- req->ec = erasure_code;
- req->watermark = watermark;
- req->flags = flags;
- req->devs_have = devs_have;
-
- BUG_ON(!nr_replicas || !nr_replicas_required);
-retry:
- req->ptrs.nr = 0;
- req->nr_effective = 0;
- req->have_cache = false;
- write_points_nr = c->write_points_nr;
-
- *wp_ret = req->wp = writepoint_find(trans, write_point.v);
-
- req->data_type = req->wp->data_type;
-
- ret = bch2_trans_relock(trans);
- if (ret)
- goto err;
-
- /* metadata may not allocate on cache devices: */
- if (req->data_type != BCH_DATA_user)
- req->have_cache = true;
-
- if (target && !(flags & BCH_WRITE_only_specified_devs)) {
- ret = open_bucket_add_buckets(trans, req, NULL);
- if (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto alloc_done;
-
- /* Don't retry from all devices if we're out of open buckets: */
- if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
- int ret2 = open_bucket_add_buckets(trans, req, cl);
- if (!ret2 ||
- bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
- ret = ret2;
- goto alloc_done;
- }
- }
-
- /*
- * Only try to allocate cache (durability = 0 devices) from the
- * specified target:
- */
- req->have_cache = true;
- req->target = 0;
-
- ret = open_bucket_add_buckets(trans, req, cl);
- } else {
- ret = open_bucket_add_buckets(trans, req, cl);
- }
-alloc_done:
- BUG_ON(!ret && req->nr_effective < req->nr_replicas);
-
- if (erasure_code && !ec_open_bucket(c, &req->ptrs))
- pr_debug("failed to get ec bucket: ret %u", ret);
-
- if (ret == -BCH_ERR_insufficient_devices &&
- req->nr_effective >= nr_replicas_required)
- ret = 0;
-
- if (ret)
- goto err;
-
- if (req->nr_effective > req->nr_replicas)
- deallocate_extra_replicas(c, req);
-
- /* Free buckets we didn't use: */
- open_bucket_for_each(c, &req->wp->ptrs, ob, i)
- open_bucket_free_unused(c, ob);
-
- req->wp->ptrs = req->ptrs;
-
- req->wp->sectors_free = UINT_MAX;
-
- open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
- /*
- * Ensure proper write alignment - either due to misaligned
- * bucket sizes (from buggy bcachefs-tools), or writes that mix
- * logical/physical alignment:
- */
- struct bch_dev *ca = ob_dev(c, ob);
- u64 offset = bucket_to_sector(ca, ob->bucket) +
- ca->mi.bucket_size -
- ob->sectors_free;
- unsigned align = round_up(offset, block_sectors(c)) - offset;
-
- ob->sectors_free = max_t(int, 0, ob->sectors_free - align);
-
- req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free);
- }
-
- req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c));
-
- /* Did alignment use up space in an open_bucket? */
- if (unlikely(!req->wp->sectors_free)) {
- bch2_alloc_sectors_done(c, req->wp);
- goto retry;
- }
-
- BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX);
-
- return 0;
-err:
- open_bucket_for_each(c, &req->wp->ptrs, ob, i)
- if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v))
- ob_push(c, &req->ptrs, ob);
- else
- open_bucket_free_unused(c, ob);
- req->wp->ptrs = req->ptrs;
-
- mutex_unlock(&req->wp->lock);
-
- if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
- try_decrease_writepoints(trans, write_points_nr))
- goto retry;
-
- if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
- ret = bch_err_throw(c, bucket_alloc_blocked);
-
- if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
- bch2_err_matches(ret, BCH_ERR_freelist_empty))
- ret = bch_err_throw(c, bucket_alloc_blocked);
-
- return ret;
-}
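As a worked example of the alignment fix-up in bch2_alloc_sectors_start_trans() (hypothetical numbers): with 8-sector blocks, an open_bucket whose bucket starts at sector 1000 with bucket_size 512 and sectors_free 509 would next write at offset 1000 + 512 - 509 = 1003; round_up(1003, 8) = 1008, so align = 5 and sectors_free is trimmed to 504, which puts the next write at sector 1008, a block boundary.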
-
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
- struct bkey_i *k, unsigned sectors,
- bool cached)
-{
- bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
-}
-
-/*
- * Finished writing to @wp: unlock it, and release any open_buckets that no
- * longer have room for a full block
- */
-void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
-{
- bch2_alloc_sectors_done_inlined(c, wp);
-}
-
-static inline void writepoint_init(struct write_point *wp,
- enum bch_data_type type)
-{
- mutex_init(&wp->lock);
- wp->data_type = type;
-
- INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
- INIT_LIST_HEAD(&wp->writes);
- spin_lock_init(&wp->writes_lock);
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *c)
-{
- struct open_bucket *ob;
- struct write_point *wp;
-
- mutex_init(&c->write_points_hash_lock);
- c->write_points_nr = ARRAY_SIZE(c->write_points);
-
-	/* open bucket 0 is a sentinel NULL: */
- spin_lock_init(&c->open_buckets[0].lock);
-
- for (ob = c->open_buckets + 1;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
- spin_lock_init(&ob->lock);
- c->open_buckets_nr_free++;
-
- ob->freelist = c->open_buckets_freelist;
- c->open_buckets_freelist = ob - c->open_buckets;
- }
-
- writepoint_init(&c->btree_write_point, BCH_DATA_btree);
- writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
- writepoint_init(&c->copygc_write_point, BCH_DATA_user);
-
- for (wp = c->write_points;
- wp < c->write_points + c->write_points_nr; wp++) {
- writepoint_init(wp, BCH_DATA_user);
-
- wp->last_used = local_clock();
- wp->write_point = (unsigned long) wp;
- hlist_add_head_rcu(&wp->node,
- writepoint_hash(c, wp->write_point));
- }
-}
-
-void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
- unsigned data_type = ob->data_type;
- barrier(); /* READ_ONCE() doesn't work on bitfields */
-
- prt_printf(out, "%zu ref %u ",
- ob - c->open_buckets,
- atomic_read(&ob->pin));
- bch2_prt_data_type(out, data_type);
- prt_printf(out, " %u:%llu gen %u allocated %u/%u",
- ob->dev, ob->bucket, ob->gen,
- ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
- if (ob->ec)
- prt_printf(out, " ec idx %llu", ob->ec->idx);
- if (ob->on_partial_list)
- prt_str(out, " partial");
- prt_newline(out);
-}
-
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_dev *ca)
-{
- struct open_bucket *ob;
-
- out->atomic++;
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- spin_lock(&ob->lock);
- if (ob->valid && (!ca || ob->dev == ca->dev_idx))
- bch2_open_bucket_to_text(out, c, ob);
- spin_unlock(&ob->lock);
- }
-
- --out->atomic;
-}
-
-void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
-{
- unsigned i;
-
- out->atomic++;
- spin_lock(&c->freelist_lock);
-
- for (i = 0; i < c->open_buckets_partial_nr; i++)
- bch2_open_bucket_to_text(out, c,
- c->open_buckets + c->open_buckets_partial[i]);
-
- spin_unlock(&c->freelist_lock);
- --out->atomic;
-}
-
-static const char * const bch2_write_point_states[] = {
-#define x(n) #n,
- WRITE_POINT_STATES()
-#undef x
- NULL
-};
-
-static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
- struct write_point *wp)
-{
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&wp->lock);
-
- prt_printf(out, "%lu: ", wp->write_point);
- prt_human_readable_u64(out, wp->sectors_allocated << 9);
-
- prt_printf(out, " last wrote: ");
- bch2_pr_time_units(out, sched_clock() - wp->last_used);
-
- for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
- prt_printf(out, " %s: ", bch2_write_point_states[i]);
- bch2_pr_time_units(out, wp->time[i]);
- }
-
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- bch2_open_bucket_to_text(out, c, ob);
- printbuf_indent_sub(out, 2);
-
- mutex_unlock(&wp->lock);
-}
-
-void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct write_point *wp;
-
- prt_str(out, "Foreground write points\n");
- for (wp = c->write_points;
- wp < c->write_points + ARRAY_SIZE(c->write_points);
- wp++)
- bch2_write_point_to_text(out, c, wp);
-
- prt_str(out, "Copygc write point\n");
- bch2_write_point_to_text(out, c, &c->copygc_write_point);
-
- prt_str(out, "Rebalance write point\n");
- bch2_write_point_to_text(out, c, &c->rebalance_write_point);
-
- prt_str(out, "Btree write point\n");
- bch2_write_point_to_text(out, c, &c->btree_write_point);
-}
-
-void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
-{
- unsigned nr[BCH_DATA_NR];
-
- memset(nr, 0, sizeof(nr));
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].data_type]++;
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 24);
-
- prt_printf(out, "capacity\t%llu\n", c->capacity);
- prt_printf(out, "reserved\t%llu\n", c->reserved);
- prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
- prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
- prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
- prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
- prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
- prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
- prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));
-
- prt_newline(out);
- prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
- prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
- prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
- prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty");
- prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]);
- prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]);
- prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr);
-}
-
-void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca);
- unsigned nr[BCH_DATA_NR];
-
- memset(nr, 0, sizeof(nr));
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].data_type]++;
-
- bch2_dev_usage_to_text(out, ca, &stats);
-
- prt_newline(out);
-
- prt_printf(out, "reserves:\n");
- for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
- prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));
-
- prt_newline(out);
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
-
- prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
- prt_printf(out, "buckets to invalidate\t%llu\r\n",
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca)));
-}
-
-static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
-{
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
- c->opts.allocator_stuck_timeout);
-
- prt_printf(&buf, "Allocator debug:\n");
- printbuf_indent_add(&buf, 2);
- bch2_fs_alloc_debug_to_text(&buf, c);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
-
- bch2_printbuf_make_room(&buf, 4096);
-
- buf.atomic++;
- scoped_guard(rcu)
- for_each_online_member_rcu(c, ca) {
- prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
- printbuf_indent_add(&buf, 2);
- bch2_dev_alloc_debug_to_text(&buf, ca);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
- --buf.atomic;
-
- prt_printf(&buf, "Copygc debug:\n");
- printbuf_indent_add(&buf, 2);
- bch2_copygc_wait_to_text(&buf, c);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
-
- prt_printf(&buf, "Journal debug:\n");
- printbuf_indent_add(&buf, 2);
- bch2_journal_debug_to_text(&buf, &c->journal);
- printbuf_indent_sub(&buf, 2);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-}
-
-static inline unsigned allocator_wait_timeout(struct bch_fs *c)
-{
- if (c->allocator_last_stuck &&
- time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
- return 0;
-
- return c->opts.allocator_stuck_timeout * HZ;
-}
-
-void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
-{
- unsigned t = allocator_wait_timeout(c);
-
- if (t && closure_sync_timeout(cl, t)) {
- c->allocator_last_stuck = jiffies;
- bch2_print_allocator_stuck(c);
- }
-
- closure_sync(cl);
-}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
deleted file mode 100644
index 1b3fc8460096..000000000000
--- a/fs/bcachefs/alloc_foreground.h
+++ /dev/null
@@ -1,318 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
-#define _BCACHEFS_ALLOC_FOREGROUND_H
-
-#include "bcachefs.h"
-#include "buckets.h"
-#include "alloc_types.h"
-#include "extents.h"
-#include "io_write_types.h"
-#include "sb-members.h"
-
-#include <linux/hash.h>
-
-struct bkey;
-struct bch_dev;
-struct bch_fs;
-struct bch_devs_list;
-
-extern const char * const bch2_watermarks[];
-
-void bch2_reset_alloc_cursors(struct bch_fs *);
-
-struct dev_alloc_list {
- unsigned nr;
- u8 data[BCH_SB_MEMBERS_MAX];
-};
-
-struct alloc_request {
- unsigned nr_replicas;
- unsigned target;
- bool ec;
- enum bch_watermark watermark;
- enum bch_write_flags flags;
- enum bch_data_type data_type;
- struct bch_devs_list *devs_have;
- struct write_point *wp;
-
- /* These fields are used primarily by open_bucket_add_buckets */
- struct open_buckets ptrs;
- unsigned nr_effective; /* sum of @ptrs durability */
- bool have_cache; /* have we allocated from a 0 durability dev */
- struct bch_devs_mask devs_may_alloc;
-
- /* bch2_bucket_alloc_set_trans(): */
- struct dev_alloc_list devs_sorted;
- struct bch_dev_usage usage;
-
- /* bch2_bucket_alloc_trans(): */
- struct bch_dev *ca;
-
- enum {
- BTREE_BITMAP_NO,
- BTREE_BITMAP_YES,
- BTREE_BITMAP_ANY,
- } btree_bitmap;
-
- struct {
- u64 buckets_seen;
- u64 skipped_open;
- u64 skipped_need_journal_commit;
- u64 need_journal_commit;
- u64 skipped_nocow;
- u64 skipped_nouse;
- u64 skipped_mi_btree_bitmap;
- } counters;
-
- unsigned scratch_nr_replicas;
- unsigned scratch_nr_effective;
- bool scratch_have_cache;
- enum bch_data_type scratch_data_type;
- struct open_buckets scratch_ptrs;
- struct bch_devs_mask scratch_devs_may_alloc;
-};
-
-void bch2_dev_alloc_list(struct bch_fs *,
- struct dev_stripe_state *,
- struct bch_devs_mask *,
- struct dev_alloc_list *);
-void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
-
-static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
-{
- return bch2_dev_have_ref(c, ob->dev);
-}
-
-static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark)
-{
- switch (watermark) {
- case BCH_WATERMARK_interior_updates:
- return 0;
- case BCH_WATERMARK_reclaim:
- return OPEN_BUCKETS_COUNT / 6;
- case BCH_WATERMARK_btree:
- case BCH_WATERMARK_btree_copygc:
- return OPEN_BUCKETS_COUNT / 4;
- case BCH_WATERMARK_copygc:
- return OPEN_BUCKETS_COUNT / 3;
- default:
- return OPEN_BUCKETS_COUNT / 2;
- }
-}
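Reading the ladder above with OPEN_BUCKETS_COUNT = 1024 (defined in alloc_types.h below): interior updates reserve nothing, reclaim must leave 1024/6 = 170 open buckets in reserve, btree and btree_copygc 1024/4 = 256, copygc 1024/3 = 341, and everything else 1024/2 = 512, so lower-priority allocators back off first as open buckets run out.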
-
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
- enum bch_watermark, enum bch_data_type,
- struct closure *);
-
-static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
- struct open_bucket *ob)
-{
- BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
-
- obs->v[obs->nr++] = ob - c->open_buckets;
-}
-
-#define open_bucket_for_each(_c, _obs, _ob, _i) \
- for ((_i) = 0; \
- (_i) < (_obs)->nr && \
- ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \
- (_i)++)
-
-static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
- struct open_buckets *obs)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, obs, ob, i)
- if (ob->ec)
- return ob;
-
- return NULL;
-}
-
-void bch2_open_bucket_write_error(struct bch_fs *,
- struct open_buckets *, unsigned, int);
-
-void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
-
-static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
- if (atomic_dec_and_test(&ob->pin))
- __bch2_open_bucket_put(c, ob);
-}
-
-static inline void bch2_open_buckets_put(struct bch_fs *c,
- struct open_buckets *ptrs)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, ptrs, ob, i)
- bch2_open_bucket_put(c, ob);
- ptrs->nr = 0;
-}
-
-static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp)
-{
- struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- ob_push(c, ob->sectors_free < block_sectors(c)
- ? &ptrs
- : &keep, ob);
- wp->ptrs = keep;
-
- mutex_unlock(&wp->lock);
-
- bch2_open_buckets_put(c, &ptrs);
-}
-
-static inline void bch2_open_bucket_get(struct bch_fs *c,
- struct write_point *wp,
- struct open_buckets *ptrs)
-{
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- ob->data_type = wp->data_type;
- atomic_inc(&ob->pin);
- ob_push(c, ptrs, ob);
- }
-}
-
-static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
- unsigned dev, u64 bucket)
-{
- return c->open_buckets_hash +
- (jhash_3words(dev, bucket, bucket >> 32, 0) &
- (OPEN_BUCKETS_COUNT - 1));
-}
-
-static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
-{
- open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
-
- while (slot) {
- struct open_bucket *ob = &c->open_buckets[slot];
-
- if (ob->dev == dev && ob->bucket == bucket)
- return true;
-
- slot = ob->hash;
- }
-
- return false;
-}
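A minimal sketch of the same index-chained lookup pattern (a hypothetical toy table, not the bcachefs structures): each hash slot holds the index of the first entry, each entry holds the index of the next entry on its chain, and index 0 terminates the chain.

    #include <stdbool.h>
    #include <stdint.h>

    #define TABLE_SIZE 16			/* power of two, like OPEN_BUCKETS_COUNT */

    struct entry {
    	unsigned	dev;
    	uint64_t	bucket;
    	uint16_t	next;		/* index of next entry in the chain, 0 = end */
    };

    static struct entry	entries[64];	/* entry 0 is reserved as the terminator */
    static uint16_t		heads[TABLE_SIZE];

    static bool is_open(unsigned dev, uint64_t bucket)
    {
    	/* stand-in for jhash_3words() */
    	uint16_t slot = heads[(dev ^ (unsigned) bucket) & (TABLE_SIZE - 1)];

    	while (slot) {
    		struct entry *e = &entries[slot];

    		if (e->dev == dev && e->bucket == bucket)
    			return true;
    		slot = e->next;
    	}
    	return false;
    }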
-
-static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
-{
- bool ret;
-
- if (bch2_bucket_is_open(c, dev, bucket))
- return true;
-
- spin_lock(&c->freelist_lock);
- ret = bch2_bucket_is_open(c, dev, bucket);
- spin_unlock(&c->freelist_lock);
-
- return ret;
-}
-
-enum bch_write_flags;
-int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *,
- struct dev_stripe_state *, struct closure *);
-
-int bch2_alloc_sectors_start_trans(struct btree_trans *,
- unsigned, unsigned,
- struct write_point_specifier,
- struct bch_devs_list *,
- unsigned, unsigned,
- enum bch_watermark,
- enum bch_write_flags,
- struct closure *,
- struct write_point **);
-
-static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- return (struct bch_extent_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_ptr,
- .gen = ob->gen,
- .dev = ob->dev,
- .offset = bucket_to_sector(ca, ob->bucket) +
- ca->mi.bucket_size -
- ob->sectors_free,
- };
-}
-
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
-static inline void
-bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
- struct bkey_i *k, unsigned sectors,
- bool cached)
-{
- struct open_bucket *ob;
- unsigned i;
-
- BUG_ON(sectors > wp->sectors_free);
- wp->sectors_free -= sectors;
- wp->sectors_allocated += sectors;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = ob_dev(c, ob);
- struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
-
- ptr.cached = cached ||
- (!ca->mi.durability &&
- wp->data_type == BCH_DATA_user);
-
- bch2_bkey_append_ptr(k, ptr);
-
- BUG_ON(sectors > ob->sectors_free);
- ob->sectors_free -= sectors;
- }
-}
-
-void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
- struct bkey_i *, unsigned, bool);
-void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
-
-void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
-
-static inline struct write_point_specifier writepoint_hashed(unsigned long v)
-{
- return (struct write_point_specifier) { .v = v | 1 };
-}
-
-static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
-{
- return (struct write_point_specifier) { .v = (unsigned long) wp };
-}
-
-void bch2_fs_allocator_foreground_init(struct bch_fs *);
-
-void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *);
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *);
-void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
-void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
-
-void __bch2_wait_on_allocator(struct bch_fs *, struct closure *);
-static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
-{
- if (cl->closure_get_happened)
- __bch2_wait_on_allocator(c, cl);
-}
-
-#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
deleted file mode 100644
index e7becdf22cba..000000000000
--- a/fs/bcachefs/alloc_types.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ALLOC_TYPES_H
-#define _BCACHEFS_ALLOC_TYPES_H
-
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include "clock_types.h"
-#include "fifo.h"
-
-#define BCH_WATERMARKS() \
- x(stripe) \
- x(normal) \
- x(copygc) \
- x(btree) \
- x(btree_copygc) \
- x(reclaim) \
- x(interior_updates)
-
-enum bch_watermark {
-#define x(name) BCH_WATERMARK_##name,
- BCH_WATERMARKS()
-#undef x
- BCH_WATERMARK_NR,
-};
-
-#define BCH_WATERMARK_BITS 3
-#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS)
-
-#define OPEN_BUCKETS_COUNT 1024
-
-#define WRITE_POINT_HASH_NR 32
-#define WRITE_POINT_MAX 32
-
-/*
- * 0 is never a valid open_bucket_idx_t:
- */
-typedef u16 open_bucket_idx_t;
-
-struct open_bucket {
- spinlock_t lock;
- atomic_t pin;
- open_bucket_idx_t freelist;
- open_bucket_idx_t hash;
-
- /*
- * When an open bucket has an ec_stripe attached, this is the index of
- * the block in the stripe this open_bucket corresponds to:
- */
- u8 ec_idx;
- enum bch_data_type data_type:6;
- unsigned valid:1;
- unsigned on_partial_list:1;
-
- u8 dev;
- u8 gen;
- u32 sectors_free;
- u64 bucket;
- struct ec_stripe_new *ec;
-};
-
-#define OPEN_BUCKET_LIST_MAX 15
-
-struct open_buckets {
- open_bucket_idx_t nr;
- open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX];
-};
-
-struct dev_stripe_state {
- u64 next_alloc[BCH_SB_MEMBERS_MAX];
-};
-
-#define WRITE_POINT_STATES() \
- x(stopped) \
- x(waiting_io) \
- x(waiting_work) \
- x(runnable) \
- x(running)
-
-enum write_point_state {
-#define x(n) WRITE_POINT_##n,
- WRITE_POINT_STATES()
-#undef x
- WRITE_POINT_STATE_NR
-};
-
-struct write_point {
- struct {
- struct hlist_node node;
- struct mutex lock;
- u64 last_used;
- unsigned long write_point;
- enum bch_data_type data_type;
-
- /* calculated based on how many pointers we're actually going to use: */
- unsigned sectors_free;
-
- struct open_buckets ptrs;
- struct dev_stripe_state stripe;
-
- u64 sectors_allocated;
- } __aligned(SMP_CACHE_BYTES);
-
- struct {
- struct work_struct index_update_work;
-
- struct list_head writes;
- spinlock_t writes_lock;
-
- enum write_point_state state;
- u64 last_state_change;
- u64 time[WRITE_POINT_STATE_NR];
- u64 last_runtime;
- } __aligned(SMP_CACHE_BYTES);
-};
-
-struct write_point_specifier {
- unsigned long v;
-};
-
-#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c
deleted file mode 100644
index a7cd1f0f0964..000000000000
--- a/fs/bcachefs/async_objs.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Async obj debugging: keep asynchronous objects on (very fast) lists, make
- * them visible in debugfs:
- */
-
-#include "bcachefs.h"
-#include "async_objs.h"
-#include "btree_io.h"
-#include "debug.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/debugfs.h>
-
-static void promote_obj_to_text(struct printbuf *out, void *obj)
-{
- bch2_promote_op_to_text(out, obj);
-}
-
-static void rbio_obj_to_text(struct printbuf *out, void *obj)
-{
- bch2_read_bio_to_text(out, obj);
-}
-
-static void write_op_obj_to_text(struct printbuf *out, void *obj)
-{
- bch2_write_op_to_text(out, obj);
-}
-
-static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj)
-{
- struct btree_read_bio *rbio = obj;
- bch2_btree_read_bio_to_text(out, rbio);
-}
-
-static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj)
-{
- struct btree_write_bio *wbio = obj;
- bch2_bio_to_text(out, &wbio->wbio.bio);
-}
-
-static int bch2_async_obj_list_open(struct inode *inode, struct file *file)
-{
- struct async_obj_list *list = inode->i_private;
- struct dump_iter *i;
-
- i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
- if (!i)
- return -ENOMEM;
-
- file->private_data = i;
- i->from = POS_MIN;
- i->iter = 0;
- i->c = container_of(list, struct bch_fs, async_objs[list->idx]);
- i->list = list;
- i->buf = PRINTBUF;
- return 0;
-}
-
-static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct async_obj_list *list = i->list;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- struct genradix_iter iter;
- void *obj;
- fast_list_for_each_from(&list->list, iter, obj, i->iter) {
- ret = bch2_debugfs_flush_buf(i);
- if (ret)
- return ret;
-
- if (!i->size)
- break;
-
- list->obj_to_text(&i->buf, obj);
- }
-
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
- else
- i->iter = iter.pos;
-
- if (!ret)
- ret = bch2_debugfs_flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static const struct file_operations async_obj_ops = {
- .owner = THIS_MODULE,
- .open = bch2_async_obj_list_open,
- .release = bch2_dump_release,
- .read = bch2_async_obj_list_read,
-};
-
-void bch2_fs_async_obj_debugfs_init(struct bch_fs *c)
-{
- c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir);
-
-#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \
- &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops);
- BCH_ASYNC_OBJ_LISTS()
-#undef x
-}
-
-void bch2_fs_async_obj_exit(struct bch_fs *c)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++)
- fast_list_exit(&c->async_objs[i].list);
-}
-
-int bch2_fs_async_obj_init(struct bch_fs *c)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) {
- if (fast_list_init(&c->async_objs[i].list))
- return -BCH_ERR_ENOMEM_async_obj_init;
- c->async_objs[i].idx = i;
- }
-
-#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text;
- BCH_ASYNC_OBJ_LISTS()
-#undef x
-
- return 0;
-}
diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h
deleted file mode 100644
index cd6489b8cf76..000000000000
--- a/fs/bcachefs/async_objs.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ASYNC_OBJS_H
-#define _BCACHEFS_ASYNC_OBJS_H
-
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
-static inline void __async_object_list_del(struct fast_list *head, unsigned idx)
-{
- fast_list_remove(head, idx);
-}
-
-static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx)
-{
- int ret = fast_list_add(head, obj);
- *idx = ret > 0 ? ret : 0;
- return ret < 0 ? ret : 0;
-}
-
-#define async_object_list_del(_c, _list, idx) \
- __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx)
-
-#define async_object_list_add(_c, _list, obj, idx) \
- __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx)
-
-void bch2_fs_async_obj_debugfs_init(struct bch_fs *);
-void bch2_fs_async_obj_exit(struct bch_fs *);
-int bch2_fs_async_obj_init(struct bch_fs *);
-
-#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
-
-#define async_object_list_del(_c, _n, idx) do {} while (0)
-
-static inline int __async_object_list_add(void)
-{
- return 0;
-}
-#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add()
-
-static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {}
-static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {}
-static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; }
-
-#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
-
-#endif /* _BCACHEFS_ASYNC_OBJS_H */
diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h
deleted file mode 100644
index 8d713c0f5841..000000000000
--- a/fs/bcachefs/async_objs_types.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H
-#define _BCACHEFS_ASYNC_OBJS_TYPES_H
-
-#define BCH_ASYNC_OBJ_LISTS() \
- x(promote) \
- x(rbio) \
- x(write_op) \
- x(btree_read_bio) \
- x(btree_write_bio)
-
-enum bch_async_obj_lists {
-#define x(n) BCH_ASYNC_OBJ_LIST_##n,
- BCH_ASYNC_OBJ_LISTS()
-#undef x
- BCH_ASYNC_OBJ_NR
-};
-
-struct async_obj_list {
- struct fast_list list;
- void (*obj_to_text)(struct printbuf *, void *);
- unsigned idx;
-};
-
-#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
deleted file mode 100644
index 77d93beb3c8f..000000000000
--- a/fs/bcachefs/backpointers.c
+++ /dev/null
@@ -1,1391 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "checksum.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "progress.h"
-#include "recovery_passes.h"
-
-#include <linux/mm.h>
-
-static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64);
-
-static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
-{
- return (struct bbpos) {
- .btree = bp.btree_id,
- .pos = bp.pos,
- };
-}
-
-int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- int ret = 0;
-
- bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH,
- c, backpointer_level_bad,
- "backpointer level bad: %u >= %u",
- bp.v->level, BTREE_MAX_DEPTH);
-
- bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID,
- c, backpointer_dev_bad,
- "backpointer for BCH_SB_MEMBER_INVALID");
-fsck_err:
- return ret;
-}
-
-void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
- struct bch_dev *ca;
- u32 bucket_offset;
- struct bpos bucket;
- scoped_guard(rcu) {
- ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
- if (ca)
- bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
- }
-
- if (ca)
- prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
- else
- prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
-
- bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
- prt_str(out, " data_type=");
- bch2_prt_data_type(out, bp.v->data_type);
- prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
- (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
- bp.v->bucket_len,
- bp.v->bucket_gen);
- bch2_bpos_to_text(out, bp.v->pos);
-}
-
-void bch2_backpointer_swab(struct bkey_s k)
-{
- struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
-
- bp.v->bucket_len = swab32(bp.v->bucket_len);
- bch2_bpos_swab(&bp.v->pos);
-}
-
-static bool extent_matches_bp(struct bch_fs *c,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- struct bkey_s_c_backpointer bp)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bkey_i_backpointer bp2;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2);
-
- if (bpos_eq(bp.k->p, bp2.k.p) &&
- !memcmp(bp.v, &bp2.v, sizeof(bp2.v)))
- return true;
- }
-
- return false;
-}
-
-static noinline int backpointer_mod_err(struct btree_trans *trans,
- struct bkey_s_c orig_k,
- struct bkey_i_backpointer *new_bp,
- struct bkey_s_c found_bp,
- bool insert)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- bool will_check = c->recovery.passes_to_run &
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
- int ret = 0;
-
- if (insert) {
- prt_printf(&buf, "existing backpointer found when inserting ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
- prt_newline(&buf);
- printbuf_indent_add(&buf, 2);
-
- prt_printf(&buf, "found ");
- bch2_bkey_val_to_text(&buf, c, found_bp);
- prt_newline(&buf);
-
- prt_printf(&buf, "for ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- } else if (!will_check) {
- prt_printf(&buf, "backpointer not found when deleting\n");
- printbuf_indent_add(&buf, 2);
-
- prt_printf(&buf, "searching for ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i));
- prt_newline(&buf);
-
- prt_printf(&buf, "got ");
- bch2_bkey_val_to_text(&buf, c, found_bp);
- prt_newline(&buf);
-
- prt_printf(&buf, "for ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- }
-
- if (!will_check && __bch2_inconsistent_error(c, &buf))
- ret = bch_err_throw(c, erofs_unfixed_errors);
-
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
- struct bkey_s_c orig_k,
- struct bkey_i_backpointer *bp,
- bool insert)
-{
- struct btree_iter bp_iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
- bp->k.p,
- BTREE_ITER_intent|
- BTREE_ITER_slots|
- BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (insert
- ? k.k->type
- : (k.k->type != KEY_TYPE_backpointer ||
- memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) {
- ret = backpointer_mod_err(trans, orig_k, bp, k, insert);
- if (ret)
- goto err;
- }
-
- if (!insert) {
- bp->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp->k, 0);
- }
-
- ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &bp_iter);
- return ret;
-}
-
-static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
-{
- return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
- ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
- : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
- struct bkey_s_c visiting_k,
- struct bkey_buf *last_flushed)
-{
- return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
- ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
- : 0;
-}
-
-static int backpointer_target_not_found(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct bkey_s_c target_k,
- struct bkey_buf *last_flushed,
- bool commit)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- /*
- * If we're using the btree write buffer, the backpointer we were
- * looking at may have already been deleted - failure to find what it
- * pointed to is not an error:
- */
- ret = last_flushed
- ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed)
- : 0;
- if (ret)
- return ret;
-
-	prt_printf(&buf, "backpointer doesn't match the %s it points to:\n",
- bp.v->level ? "btree node" : "extent");
- bch2_bkey_val_to_text(&buf, c, bp.s_c);
-
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, target_k);
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
- if (p.ptr.dev == bp.k->p.inode) {
- prt_newline(&buf);
- struct bkey_i_backpointer bp2;
- bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
- }
-
- if (fsck_err(trans, backpointer_to_missing_ptr,
- "%s", buf.buf)) {
- ret = bch2_backpointer_del(trans, bp.k->p);
- if (ret || !commit)
- goto out;
-
- /*
- * Normally, on transaction commit from inside a transaction,
- * we'll return -BCH_ERR_transaction_restart_nested, since a
- * transaction commit invalidates pointers given out by peek().
- *
- * However, since we're updating a write buffer btree, if we
- * return a transaction restart and loop we won't see that the
- * backpointer has been deleted without an additional write
- * buffer flush - and those are expensive.
- *
- * So we're relying on the caller immediately advancing to the
- * next backpointer and starting a new transaction immediately
- * after backpointer_get_key() returns NULL:
- */
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- }
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- struct bkey_buf *last_flushed,
- bool commit)
-{
- struct bch_fs *c = trans->c;
-
- BUG_ON(!bp.v->level);
-
- bch2_trans_node_iter_init(trans, iter,
- bp.v->btree_id,
- bp.v->pos,
- 0,
- bp.v->level - 1,
- 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, iter);
- if (IS_ERR_OR_NULL(b))
- goto err;
-
- BUG_ON(b->c.level != bp.v->level - 1);
-
- if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
- bkey_i_to_s_c(&b->key), bp))
- return b;
-
- if (btree_node_will_make_reachable(b)) {
- b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node));
- } else {
- int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
- last_flushed, commit);
- b = ret ? ERR_PTR(ret) : NULL;
- }
-err:
- bch2_trans_iter_exit(trans, iter);
- return b;
-}
-
-static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- unsigned iter_flags,
- struct bkey_buf *last_flushed,
- bool commit)
-{
- struct bch_fs *c = trans->c;
-
- if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
- return bkey_s_c_null;
-
- bch2_trans_node_iter_init(trans, iter,
- bp.v->btree_id,
- bp.v->pos,
- 0,
- bp.v->level,
- iter_flags);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
- if (bkey_err(k)) {
- bch2_trans_iter_exit(trans, iter);
- return k;
- }
-
- /*
- * peek_slot() doesn't normally return NULL - except when we ask for a
- * key at a btree level that doesn't exist.
- *
- * We may want to revisit this and change peek_slot():
- */
- if (!k.k) {
- bkey_init(&iter->k);
- iter->k.p = bp.v->pos;
- k.k = &iter->k;
- }
-
- if (k.k &&
- extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
-
- if (!bp.v->level) {
- int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
- } else {
- struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
- if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
- return bkey_s_c_null;
- if (IS_ERR_OR_NULL(b))
- return ((struct bkey_s_c) { .k = ERR_CAST(b) });
-
- return bkey_i_to_s_c(&b->key);
- }
-}
-
-struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- struct bkey_buf *last_flushed)
-{
- return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true);
-}
-
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- unsigned iter_flags,
- struct bkey_buf *last_flushed)
-{
- return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true);
-}
-
-static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
- struct bkey_buf *last_flushed)
-{
- if (k.k->type != KEY_TYPE_backpointer)
- return 0;
-
- struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter = {};
- struct bkey_s_c alloc_k;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bpos bucket;
- if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
- ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
- if (ret)
- goto out;
-
- if (fsck_err(trans, backpointer_to_missing_device,
- "backpointer for missing device:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_backpointer_del(trans, k.k->p);
- goto out;
- }
-
- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0);
- ret = bkey_err(alloc_k);
- if (ret)
- goto out;
-
- if (alloc_k.k->type != KEY_TYPE_alloc_v4) {
- ret = bch2_backpointers_maybe_flush(trans, k, last_flushed);
- if (ret)
- goto out;
-
- if (fsck_err(trans, backpointer_to_missing_alloc,
- "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
- alloc_iter.pos.inode, alloc_iter.pos.offset,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_backpointer_del(trans, k.k->p);
- }
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &alloc_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-/* verify that every backpointer has a corresponding alloc key */
-int bch2_check_btree_backpointers(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_backpointers, POS_MIN, 0, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed)));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-}
-
-struct extents_to_bp_state {
- struct bpos bp_start;
- struct bpos bp_end;
- struct bkey_buf last_flushed;
-};
-
-static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
- struct bkey_s_c extent, unsigned dev)
-{
- struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- bch2_bkey_drop_device(bkey_i_to_s(n), dev);
- return bch2_btree_insert_trans(trans, btree, n, 0);
-}
-
-static int check_extent_checksum(struct btree_trans *trans,
- enum btree_id btree, struct bkey_s_c extent,
- enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct printbuf buf = PRINTBUF;
- void *data_buf = NULL;
- struct bio *bio = NULL;
- size_t bytes;
- int ret = 0;
-
- if (bkey_is_btree_ptr(extent.k))
- return false;
-
- bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
- if (p.ptr.dev == dev)
- goto found;
- BUG();
-found:
- if (!p.crc.csum_type)
- return false;
-
- bytes = p.crc.compressed_size << 9;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ,
- BCH_DEV_READ_REF_check_extent_checksums);
- if (!ca)
- return false;
-
- data_buf = kvmalloc(bytes, GFP_KERNEL);
- if (!data_buf) {
- ret = -ENOMEM;
- goto err;
- }
-
- bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
- bio->bi_iter.bi_sector = p.ptr.offset;
- bch2_bio_map(bio, data_buf, bytes);
- ret = submit_bio_wait(bio);
- if (ret)
- goto err;
-
- prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n");
- bch2_btree_id_to_text(&buf, btree);
- prt_str(&buf, " ");
- bch2_bkey_val_to_text(&buf, c, extent);
- prt_newline(&buf);
- bch2_btree_id_to_text(&buf, o_btree);
- prt_str(&buf, " ");
- bch2_bkey_val_to_text(&buf, c, extent2);
-
- struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
- struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
- if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
- trans, dup_backpointer_to_bad_csum_extent,
- "%s", buf.buf))
- ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
-fsck_err:
-err:
- if (bio)
- bio_put(bio);
- kvfree(data_buf);
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_check_extent_checksums);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int check_bp_exists(struct btree_trans *trans,
- struct extents_to_bp_state *s,
- struct bkey_i_backpointer *bp,
- struct bkey_s_c orig_k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter other_extent_iter = {};
- struct printbuf buf = PRINTBUF;
-
- if (bpos_lt(bp->k.p, s->bp_start) ||
- bpos_gt(bp->k.p, s->bp_end))
- return 0;
-
- struct btree_iter bp_iter;
- struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0);
- int ret = bkey_err(bp_k);
- if (ret)
- goto err;
-
- if (bp_k.k->type != KEY_TYPE_backpointer ||
- memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed);
- if (ret)
- goto err;
-
- goto check_existing_bp;
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &other_extent_iter);
- bch2_trans_iter_exit(trans, &bp_iter);
- printbuf_exit(&buf);
- return ret;
-check_existing_bp:
- /* Do we have a backpointer for a different extent? */
- if (bp_k.k->type != KEY_TYPE_backpointer)
- goto missing;
-
- struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
-
- struct bkey_s_c other_extent =
- __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);
- ret = bkey_err(other_extent);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- ret = 0;
- if (ret)
- goto err;
-
- if (!other_extent.k)
- goto missing;
-
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
- if (ca) {
- struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
- bkey_for_each_ptr(other_extent_ptrs, ptr)
- if (ptr->dev == bp->k.p.inode &&
- dev_ptr_stale_rcu(ca, ptr)) {
- rcu_read_unlock();
- ret = drop_dev_and_update(trans, other_bp.v->btree_id,
- other_extent, bp->k.p.inode);
- if (ret)
- goto err;
- goto out;
- }
- }
- rcu_read_unlock();
-
- if (bch2_extents_match(orig_k, other_extent)) {
- printbuf_reset(&buf);
- prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, other_extent);
- bch_err(c, "%s", buf.buf);
-
- if (other_extent.k->size <= orig_k.k->size) {
- ret = drop_dev_and_update(trans, other_bp.v->btree_id,
- other_extent, bp->k.p.inode);
- if (ret)
- goto err;
- goto out;
- } else {
- ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode);
- if (ret)
- goto err;
- goto missing;
- }
- }
-
- ret = check_extent_checksum(trans,
- other_bp.v->btree_id, other_extent,
- bp->v.btree_id, orig_k,
- bp->k.p.inode);
- if (ret < 0)
- goto err;
- if (ret) {
- ret = 0;
- goto missing;
- }
-
- ret = check_extent_checksum(trans, bp->v.btree_id, orig_k,
- other_bp.v->btree_id, other_extent, bp->k.p.inode);
- if (ret < 0)
- goto err;
- if (ret) {
- ret = 0;
- goto out;
- }
-
- printbuf_reset(&buf);
- prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode);
- bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, other_extent);
- bch_err(c, "%s", buf.buf);
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- goto err;
-missing:
- printbuf_reset(&buf);
- prt_str(&buf, "missing backpointer\nfor: ");
- bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\nwant: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
- prt_printf(&buf, "\ngot: ");
- bch2_bkey_val_to_text(&buf, c, bp_k);
-
- if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
- ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true);
-
- goto out;
-}
-
-static int check_extent_to_backpointers(struct btree_trans *trans,
- struct extents_to_bp_state *s,
- enum btree_id btree, unsigned level,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
- continue;
-
- bool empty;
- {
- /* scoped_guard() expands to a loop, which would break continue */
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
- if (!ca)
- continue;
-
- if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr))
- continue;
-
- u64 b = PTR_BUCKET_NR(ca, &p.ptr);
- if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b))
- continue;
-
- empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b);
- }
-
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
-
- int ret = !empty
- ? check_bp_exists(trans, s, &bp, k)
- : bch2_bucket_backpointer_mod(trans, k, &bp, true);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int check_btree_root_to_backpointers(struct btree_trans *trans,
- struct extents_to_bp_state *s,
- enum btree_id btree_id,
- int *level)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct btree *b;
- struct bkey_s_c k;
- int ret;
-retry:
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
- 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
- b = bch2_btree_iter_peek_node(trans, &iter);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err;
-
- if (b != btree_node_root(c, b)) {
- bch2_trans_iter_exit(trans, &iter);
- goto retry;
- }
-
- *level = b->c.level;
-
- k = bkey_i_to_s_c(&b->key);
- ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static u64 mem_may_pin_bytes(struct bch_fs *c)
-{
- struct sysinfo i;
- si_meminfo(&i);
-
- u64 mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
-}
-
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
-{
- return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
-}
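-
-/*
- * Rough illustration with hypothetical numbers: with 16GiB of RAM,
- * fsck_memory_usage_percent=50 and 256KiB btree nodes, mem_may_pin_bytes()
- * allows ~8GiB to be pinned and btree_nodes_fit_in_ram() works out to
- * roughly 32768 nodes per pass.
- */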
-
-static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
- u64 btree_leaf_mask,
- u64 btree_interior_mask,
- struct bbpos start, struct bbpos *end)
-{
- struct bch_fs *c = trans->c;
- s64 mem_may_pin = mem_may_pin_bytes(c);
- int ret = 0;
-
- bch2_btree_cache_unpin(c);
-
- btree_interior_mask |= btree_leaf_mask;
-
- c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
- c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
- c->btree_cache.pinned_nodes_start = start;
- c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
-
- for (enum btree_id btree = start.btree;
- btree < BTREE_ID_NR && !ret;
- btree++) {
- unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1;
-
- if (!(BIT_ULL(btree) & btree_leaf_mask) &&
- !(BIT_ULL(btree) & btree_interior_mask))
- continue;
-
- ret = __for_each_btree_node(trans, iter, btree,
- btree == start.btree ? start.pos : POS_MIN,
- 0, depth, BTREE_ITER_prefetch, b, ({
- mem_may_pin -= btree_buf_bytes(b);
- if (mem_may_pin <= 0) {
- c->btree_cache.pinned_nodes_end = *end =
- BBPOS(btree, b->key.k.p);
- break;
- }
- bch2_node_pin(c, b);
- 0;
- }));
- }
-
- return ret;
-}
-
-static inline int bch2_fs_going_ro(struct bch_fs *c)
-{
- return test_bit(BCH_FS_going_ro, &c->flags)
- ? -EROFS
- : 0;
-}
-
-static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
- struct extents_to_bp_state *s)
-{
- struct bch_fs *c = trans->c;
- struct progress_indicator_state progress;
- int ret = 0;
-
- bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
-
- for (enum btree_id btree_id = 0;
- btree_id < btree_id_nr_alive(c);
- btree_id++) {
- int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- check_btree_root_to_backpointers(trans, s, btree_id, &level));
- if (ret)
- return ret;
-
- while (level >= depth) {
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level,
- BTREE_ITER_prefetch);
-
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
- bch2_fs_going_ro(c) ?:
- check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- }));
- if (ret)
- return ret;
-
- --level;
- }
- }
-
- return 0;
-}
-
-enum alloc_sector_counter {
- ALLOC_dirty,
- ALLOC_cached,
- ALLOC_stripe,
- ALLOC_SECTORS_NR
-};
-
-static int data_type_to_alloc_counter(enum bch_data_type t)
-{
- switch (t) {
- case BCH_DATA_btree:
- case BCH_DATA_user:
- return ALLOC_dirty;
- case BCH_DATA_cached:
- return ALLOC_cached;
- case BCH_DATA_stripe:
- case BCH_DATA_parity:
- return ALLOC_stripe;
- default:
- return -1;
- }
-}
-
-static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);
-
-static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
- bool *had_mismatch,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
- bool need_commit = false;
-
- *had_mismatch = false;
-
- if (a->data_type == BCH_DATA_sb ||
- a->data_type == BCH_DATA_journal ||
- a->data_type == BCH_DATA_parity)
- return 0;
-
- u32 sectors[ALLOC_SECTORS_NR];
- memset(sectors, 0, sizeof(sectors));
-
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p);
- if (!ca)
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c bp_k;
- int ret = 0;
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, alloc_k.k->p),
- bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) {
- if (bp_k.k->type != KEY_TYPE_backpointer)
- continue;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
- (bp.v->bucket_gen != a->gen ||
- bp.v->pad)) {
- ret = bch2_backpointer_del(trans, bp_k.k->p);
- if (ret)
- break;
-
- need_commit = true;
- continue;
- }
-
- if (bp.v->bucket_gen != a->gen)
- continue;
-
- int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
- if (alloc_counter < 0)
- continue;
-
- sectors[alloc_counter] += bp.v->bucket_len;
- }
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
-
- if (need_commit) {
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
- }
-
- if (sectors[ALLOC_dirty] != a->dirty_sectors ||
- sectors[ALLOC_cached] != a->cached_sectors ||
- sectors[ALLOC_stripe] != a->stripe_sectors) {
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
- ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
- if (ret)
- goto err;
- }
-
- if (sectors[ALLOC_dirty] > a->dirty_sectors ||
- sectors[ALLOC_cached] > a->cached_sectors ||
- sectors[ALLOC_stripe] > a->stripe_sectors) {
- ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
- bch_err_throw(c, transaction_restart_nested);
- goto err;
- }
-
- bool empty = (sectors[ALLOC_dirty] +
- sectors[ALLOC_stripe] +
- sectors[ALLOC_cached]) == 0;
-
- ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch,
- alloc_k.k->p.offset) ?:
- (empty
- ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty,
- alloc_k.k->p.offset)
- : 0);
-
- *had_mismatch = true;
- }
-err:
- bch2_dev_put(ca);
- return ret;
-}
-
-static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr_v2: {
- bool ret = false;
-
- guard(rcu)();
- struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
- while (pos.inode <= k.k->p.inode) {
- if (pos.inode >= c->sb.nr_devices)
- break;
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
- if (!ca)
- goto next;
-
- struct bpos bucket = bp_pos_to_bucket(ca, pos);
- u64 next = ca->mi.nbuckets;
-
- unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
- if (bitmap)
- next = min_t(u64, next,
- find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset));
-
- bucket.offset = next;
- if (bucket.offset == ca->mi.nbuckets)
- goto next;
-
- ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p);
- if (ret)
- break;
-next:
- pos = SPOS(pos.inode + 1, 0, 0);
- }
-
- return ret;
- }
- case KEY_TYPE_btree_ptr:
- return true;
- default:
- return false;
- }
-}
-
-static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
- enum btree_id btree, unsigned level)
-{
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err;
-
- if (b)
- bch2_node_pin(trans->c, b);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans,
- struct bpos start, struct bpos *end)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- struct bkey_buf tmp;
- bch2_bkey_buf_init(&tmp);
-
- bch2_btree_cache_unpin(c);
-
- *end = SPOS_MAX;
-
- s64 mem_may_pin = mem_may_pin_bytes(c);
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
- 0, 1, BTREE_ITER_prefetch);
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- if (!backpointer_node_has_missing(c, k))
- continue;
-
- mem_may_pin -= c->opts.btree_node_size;
- if (mem_may_pin <= 0)
- break;
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- struct btree_path *path = btree_iter_path(trans, &iter);
-
- BUG_ON(path->level != 1);
-
- bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1);
- }));
- if (ret)
- return ret;
-
- struct bpos pinned = SPOS_MAX;
- mem_may_pin = mem_may_pin_bytes(c);
- bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start,
- 0, 1, BTREE_ITER_prefetch);
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- if (!backpointer_node_has_missing(c, k))
- continue;
-
- mem_may_pin -= c->opts.btree_node_size;
- if (mem_may_pin <= 0) {
- *end = pinned;
- break;
- }
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- struct btree_path *path = btree_iter_path(trans, &iter);
-
- BUG_ON(path->level != 1);
-
- int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1);
-
- if (!ret2)
- pinned = tmp.k->k.p;
-
- ret;
- }));
- return ret;
-}
-
-int bch2_check_extents_to_backpointers(struct bch_fs *c)
-{
- int ret = 0;
-
- struct btree_trans *trans = bch2_trans_get(c);
- struct extents_to_bp_state s = { .bp_start = POS_MIN };
-
- bch2_bkey_buf_init(&s.last_flushed);
- bkey_init(&s.last_flushed.k->k);
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
- POS_MIN, BTREE_ITER_prefetch, k, ({
- bool had_mismatch;
- bch2_fs_going_ro(c) ?:
- check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed);
- }));
- if (ret)
- goto err;
-
- u64 nr_buckets = 0, nr_mismatches = 0;
- for_each_member_device(c, ca) {
- nr_buckets += ca->mi.nbuckets;
- nr_mismatches += ca->bucket_backpointer_mismatch.nr;
- }
-
- if (!nr_mismatches)
- goto err;
-
- bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
- nr_mismatches, nr_buckets);
-
- while (1) {
- ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
- if (ret)
- break;
-
- if ( bpos_eq(s.bp_start, POS_MIN) &&
- !bpos_eq(s.bp_end, SPOS_MAX))
- bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
- __func__, btree_nodes_fit_in_ram(c));
-
- if (!bpos_eq(s.bp_start, POS_MIN) ||
- !bpos_eq(s.bp_end, SPOS_MAX)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "check_extents_to_backpointers(): ");
- bch2_bpos_to_text(&buf, s.bp_start);
- prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, s.bp_end);
-
- bch_verbose(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_check_extents_to_backpointers_pass(trans, &s);
- if (ret || bpos_eq(s.bp_end, SPOS_MAX))
- break;
-
- s.bp_start = bpos_successor(s.bp_end);
- }
-
- for_each_member_device(c, ca) {
- bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
- bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
- }
-err:
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&s.last_flushed, c);
- bch2_btree_cache_unpin(c);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans,
- struct bpos bucket,
- bool *had_mismatch,
- struct bkey_buf *last_flushed)
-{
- struct btree_iter alloc_iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter,
- BTREE_ID_alloc, bucket,
- BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed);
- bch2_trans_iter_exit(trans, &alloc_iter);
- return ret;
-}
-
-int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans,
- struct bch_dev *ca, u64 bucket,
- bool copygc,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- bool had_mismatch;
- int ret = lockrestart_do(trans,
- check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket),
- &had_mismatch, last_flushed));
- if (ret || !had_mismatch)
- return ret;
-
- u64 nr = ca->bucket_backpointer_mismatch.nr;
- u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0;
-
- struct printbuf buf = PRINTBUF;
- __bch2_log_msg_start(ca->name, &buf);
-
- prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n",
- bucket, nr, ca->mi.nbuckets);
-
- bch2_run_explicit_recovery_pass(c, &buf,
- BCH_RECOVERY_PASS_check_extents_to_backpointers,
- nr < allowed ? RUN_RECOVERY_PASS_ratelimit : 0);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- return 0;
-}
-
-/* backpointers -> extents */
-
-static int check_one_backpointer(struct btree_trans *trans,
- struct bbpos start,
- struct bbpos end,
- struct bkey_s_c bp_k,
- struct bkey_buf *last_flushed)
-{
- if (bp_k.k->type != KEY_TYPE_backpointer)
- return 0;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- struct bbpos pos = bp_to_bbpos(*bp.v);
-
- if (bbpos_cmp(pos, start) < 0 ||
- bbpos_cmp(pos, end) > 0)
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed);
- int ret = bkey_err(k);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- return 0;
- if (ret)
- return ret;
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int check_bucket_backpointers_to_extents(struct btree_trans *trans,
- struct bch_dev *ca, struct bpos bucket)
-{
- u32 restart_count = trans->restart_count;
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, bucket),
- bucket_pos_to_bp_end(ca, bucket),
- 0, k,
- check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed)
- );
-
- bch2_bkey_buf_exit(&last_flushed, trans->c);
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
- struct bbpos start,
- struct bbpos end)
-{
- struct bch_fs *c = trans->c;
- struct bkey_buf last_flushed;
- struct progress_indicator_state progress;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
- bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
-
- int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
- POS_MIN, BTREE_ITER_prefetch, k, ({
- bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
- check_one_backpointer(trans, start, end, k, &last_flushed);
- }));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- return ret;
-}
-
-int bch2_check_backpointers_to_extents(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
- int ret;
-
- while (1) {
- ret = bch2_get_btree_in_memory_pos(trans,
- BIT_ULL(BTREE_ID_extents)|
- BIT_ULL(BTREE_ID_reflink),
- ~0,
- start, &end);
- if (ret)
- break;
-
- if (!bbpos_cmp(start, BBPOS_MIN) &&
- bbpos_cmp(end, BBPOS_MAX))
- bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
- __func__, btree_nodes_fit_in_ram(c));
-
- if (bbpos_cmp(start, BBPOS_MIN) ||
- bbpos_cmp(end, BBPOS_MAX)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "check_backpointers_to_extents(): ");
- bch2_bbpos_to_text(&buf, start);
- prt_str(&buf, "-");
- bch2_bbpos_to_text(&buf, end);
-
- bch_verbose(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
- if (ret || !bbpos_cmp(end, BBPOS_MAX))
- break;
-
- start = bbpos_successor(end);
- }
- bch2_trans_put(trans);
-
- bch2_btree_cache_unpin(c);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit)
-{
- scoped_guard(mutex, &b->lock) {
- if (!b->buckets) {
- b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
- sizeof(unsigned long), GFP_KERNEL);
- if (!b->buckets)
- return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
- }
-
- b->nr += !__test_and_set_bit(bit, b->buckets);
- }
-
- return 0;
-}
-
-int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b,
- u64 old_size, u64 new_size)
-{
- scoped_guard(mutex, &b->lock) {
- if (!b->buckets)
- return 0;
-
- unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size),
- sizeof(unsigned long), GFP_KERNEL);
- if (!n)
- return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
-
- memcpy(n, b->buckets,
- BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long));
- kvfree(b->buckets);
- b->buckets = n;
- }
-
- return 0;
-}
-
-void bch2_bucket_bitmap_free(struct bucket_bitmap *b)
-{
- mutex_lock(&b->lock);
- kvfree(b->buckets);
- b->buckets = NULL;
- b->nr = 0;
- mutex_unlock(&b->lock);
-}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
deleted file mode 100644
index 7e71afee1ac0..000000000000
--- a/fs/bcachefs/backpointers.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BACKPOINTERS_H
-#define _BCACHEFS_BACKPOINTERS_H
-
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "error.h"
-#include "super.h"
-
-static inline u64 swab40(u64 x)
-{
- return (((x & 0x00000000ffULL) << 32)|
- ((x & 0x000000ff00ULL) << 16)|
- ((x & 0x0000ff0000ULL) >> 0)|
- ((x & 0x00ff000000ULL) >> 16)|
- ((x & 0xff00000000ULL) >> 32));
-}
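-
-/*
- * Example (illustrative): swab40(0x0102030405) == 0x0504030201 - the byte
- * order of the low 40 bits is reversed, and any bits above bit 39 are
- * masked off.
- */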
-
-int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k,
- struct bkey_validate_context);
-void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_backpointer_swab(struct bkey_s);
-
-#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
- .key_validate = bch2_backpointer_validate, \
- .val_to_text = bch2_backpointer_to_text, \
- .swab = bch2_backpointer_swab, \
- .min_val_size = 32, \
-})
-
-#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
-
-/*
- * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
- * btree:
- */
-static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos)
-{
- u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
- return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
-}
-
-static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos,
- u32 *bucket_offset)
-{
- u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
- return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset));
-}
-
-static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
-{
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
- if (ca)
- *bucket = bp_pos_to_bucket(ca, bp_pos);
- return ca != NULL;
-}
-
-static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
- struct bpos bucket,
- u64 bucket_offset)
-{
- return POS(bucket.inode,
- (bucket_to_sector(ca, bucket.offset) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
-}
-
-/*
- * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
- */
-static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
- struct bpos bucket,
- u64 bucket_offset)
-{
- struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset);
- EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret)));
- return ret;
-}
-
-static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket)
-{
- return bucket_pos_to_bp(ca, bucket, 0);
-}
-
-static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket)
-{
- return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0));
-}
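-
-/*
- * Worked example of the encoding above, assuming a hypothetical bucket size
- * of 1024 sectors: bucket 10 on device 2 with bucket_offset 7 maps to
- *
- * POS(2, ((10 * 1024) << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + 7) == POS(2, 10485767)
- *
- * so all backpointers for a bucket fall in the contiguous range
- * bucket_pos_to_bp_start()..bucket_pos_to_bp_end() of the backpointers btree.
- */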
-
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
- struct bkey_s_c,
- struct bkey_i_backpointer *,
- bool);
-
-static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
- struct bkey_s_c orig_k,
- struct bkey_i_backpointer *bp,
- bool insert)
-{
- if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer))
- return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);
-
- if (!insert) {
- bp->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp->k, 0);
- }
-
- return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i);
-}
-
-static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
- struct extent_ptr_decoded p,
- const union bch_extent_entry *entry)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return BCH_DATA_btree;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- if (p.has_ec)
- return BCH_DATA_stripe;
- if (p.ptr.cached)
- return BCH_DATA_cached;
- else
- return BCH_DATA_user;
- case KEY_TYPE_stripe: {
- const struct bch_extent_ptr *ptr = &entry->ptr;
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- BUG_ON(ptr < s.v->ptrs ||
- ptr >= s.v->ptrs + s.v->nr_blocks);
-
- return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity
- : BCH_DATA_user;
- }
- default:
- BUG();
- }
-}
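-
-/*
- * For KEY_TYPE_stripe the last nr_redundant pointers are the parity blocks:
- * e.g. with nr_blocks == 6 and nr_redundant == 2, ptrs[0]..ptrs[3] report
- * BCH_DATA_user and ptrs[4]..ptrs[5] report BCH_DATA_parity.
- */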
-
-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- const union bch_extent_entry *entry,
- struct bkey_i_backpointer *bp)
-{
- bkey_backpointer_init(&bp->k_i);
- bp->k.p.inode = p.ptr.dev;
-
- if (k.k->type != KEY_TYPE_stripe)
- bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
- else {
- /*
- * Put stripe backpointers where they won't collide with the
- * extent backpointers within the stripe:
- */
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) <<
- MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1;
- }
-
- bp->v = (struct bch_backpointer) {
- .btree_id = btree_id,
- .level = level,
- .data_type = bch2_bkey_ptr_data_type(k, p, entry),
- .bucket_gen = p.ptr.gen,
- .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p),
- .pos = k.k->p,
- };
-}
-
-struct bkey_buf;
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer,
- struct btree_iter *, unsigned, struct bkey_buf *);
-struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
- struct btree_iter *, struct bkey_buf *);
-
-int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64,
- bool, struct bkey_buf *);
-
-int bch2_check_btree_backpointers(struct bch_fs *);
-int bch2_check_extents_to_backpointers(struct bch_fs *);
-int bch2_check_backpointers_to_extents(struct bch_fs *);
-
-static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i)
-{
- unsigned long *bitmap = READ_ONCE(b->buckets);
- return bitmap && test_bit(i, bitmap);
-}
-
-int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64);
-void bch2_bucket_bitmap_free(struct bucket_bitmap *);
-
-#endif /* _BCACHEFS_BACKPOINTERS_H */
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
deleted file mode 100644
index 63abe17f35ea..000000000000
--- a/fs/bcachefs/bbpos.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BBPOS_H
-#define _BCACHEFS_BBPOS_H
-
-#include "bbpos_types.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-
-static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
-{
- return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
-}
-
-static inline struct bbpos bbpos_successor(struct bbpos pos)
-{
- if (bpos_cmp(pos.pos, SPOS_MAX)) {
- pos.pos = bpos_successor(pos.pos);
- return pos;
- }
-
- if (pos.btree != BTREE_ID_NR) {
- pos.btree++;
- pos.pos = POS_MIN;
- return pos;
- }
-
- BUG();
-}
-
-static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
-{
- bch2_btree_id_to_text(out, pos.btree);
- prt_char(out, ':');
- bch2_bpos_to_text(out, pos.pos);
-}
-
-#endif /* _BCACHEFS_BBPOS_H */
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
deleted file mode 100644
index f63893344f80..000000000000
--- a/fs/bcachefs/bbpos_types.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BBPOS_TYPES_H
-#define _BCACHEFS_BBPOS_TYPES_H
-
-struct bbpos {
- enum btree_id btree;
- struct bpos pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
- return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
-
-#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
deleted file mode 100644
index ddfacad0f70c..000000000000
--- a/fs/bcachefs/bcachefs.h
+++ /dev/null
@@ -1,1295 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_H
-#define _BCACHEFS_H
-
-/*
- * SOME HIGH LEVEL CODE DOCUMENTATION:
- *
- * Bcache mostly works with cache sets, cache devices, and backing devices.
- *
- * Support for multiple cache devices hasn't quite been finished off yet, but
- * it's about 95% plumbed through. A cache set and its cache devices are sort of
- * like an md raid array and its component devices. Most of the code doesn't care
- * about individual cache devices; the main abstraction is the cache set.
- *
- * Multiple cache devices is intended to give us the ability to mirror dirty
- * cached data and metadata, without mirroring clean cached data.
- *
- * Backing devices are different, in that they have a lifetime independent of a
- * cache set. When you register a newly formatted backing device it'll come up
- * in passthrough mode, and then you can attach and detach a backing device from
- * a cache set at runtime - while it's mounted and in use. Detaching implicitly
- * invalidates any cached data for that backing device.
- *
- * A cache set can have multiple (many) backing devices attached to it.
- *
- * There's also flash only volumes - this is the reason for the distinction
- * between struct cached_dev and struct bcache_device. A flash only volume
- * works much like a bcache device that has a backing device, except the
- * "cached" data is always dirty. The end result is that we get thin
- * provisioning with very little additional code.
- *
- * Flash only volumes work but they're not production ready because the moving
- * garbage collector needs more work. More on that later.
- *
- * BUCKETS/ALLOCATION:
- *
- * Bcache is primarily designed for caching, which means that in normal
- * operation all of our available space will be allocated. Thus, we need an
- * efficient way of deleting things from the cache so we can write new things to
- * it.
- *
- * To do this, we first divide the cache device up into buckets. A bucket is the
- * unit of allocation; buckets are typically around 1 MB - anywhere from 128k to
- * 2M+ works efficiently.
- *
- * Each bucket has a 16 bit priority, and an 8 bit generation associated with
- * it. The gens and priorities for all the buckets are stored contiguously and
- * packed on disk (in a linked list of buckets - aside from the superblock, all
- * of bcache's metadata is stored in buckets).
- *
- * The priority is used to implement an LRU. We reset a bucket's priority when
- * we allocate it or on a cache hit, and every so often we decrement the priority
- * of each bucket. It could be used to implement something more sophisticated,
- * if anyone ever gets around to it.
- *
- * The generation is used for invalidating buckets. Each pointer also has an 8
- * bit generation embedded in it; for a pointer to be considered valid, its gen
- * must match the gen of the bucket it points into. Thus, to reuse a bucket all
- * we have to do is increment its gen (and write its new gen to disk; we batch
- * this up).
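- *
- * (For example, if a bucket's gen is bumped from 3 to 4, any pointer still
- * carrying gen 3 into that bucket no longer matches and is treated as stale.)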
- *
- * Bcache is entirely COW - we never write twice to a bucket, even buckets that
- * contain metadata (including btree nodes).
- *
- * THE BTREE:
- *
- * Bcache is in large part designed around the btree.
- *
- * At a high level, the btree is just an index of key -> ptr tuples.
- *
- * Keys represent extents, and thus have a size field. Keys also have a variable
- * number of pointers attached to them (potentially zero, which is handy for
- * invalidating the cache).
- *
- * The key itself is an inode:offset pair. The inode number corresponds to a
- * backing device or a flash only volume. The offset is the ending offset of the
- * extent within the inode - not the starting offset; this makes lookups
- * slightly more convenient.
- *
- * Pointers contain the cache device id, the offset on that device, and an 8 bit
- * generation number. More on the gen later.
- *
- * Index lookups are not fully abstracted - cache lookups in particular are
- * still somewhat mixed in with the btree code, but things are headed in that
- * direction.
- *
- * Updates are fairly well abstracted, though. There are two different ways of
- * updating the btree; insert and replace.
- *
- * BTREE_INSERT will just take a list of keys and insert them into the btree -
- * overwriting (possibly only partially) any extents they overlap with. This is
- * used to update the index after a write.
- *
- * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
- * overwriting a key that matches another given key. This is used for inserting
- * data into the cache after a cache miss, and for background writeback, and for
- * the moving garbage collector.
- *
- * There is no "delete" operation; deleting things from the index is
- * accomplished either by invalidating pointers (by incrementing a bucket's
- * gen) or by inserting a key with 0 pointers - which will overwrite anything
- * previously present at that location in the index.
- *
- * This means that there are always stale/invalid keys in the btree. They're
- * filtered out by the code that iterates through a btree node, and removed when
- * a btree node is rewritten.
- *
- * BTREE NODES:
- *
- * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
- * free smaller than a bucket - so, that's how big our btree nodes are.
- *
- * (If buckets are really big we'll only use part of the bucket for a btree node
- * - no less than 1/4th - but a bucket still contains no more than a single
- * btree node. I'd actually like to change this, but for now we rely on the
- * bucket's gen for deleting btree nodes when we rewrite/split a node.)
- *
- * Anyways, btree nodes are big - big enough to be inefficient with a textbook
- * btree implementation.
- *
- * The way this is solved is that btree nodes are internally log structured; we
- * can append new keys to an existing btree node without rewriting it. This
- * means each set of keys we write is sorted, but the node is not.
- *
- * We maintain this log structure in memory - keeping 1Mb of keys sorted would
- * be expensive, and we have to distinguish between the keys we have written and
- * the keys we haven't. So to do a lookup in a btree node, we have to search
- * each sorted set. But we do merge written sets together lazily, so the cost of
- * these extra searches is quite low (normally most of the keys in a btree node
- * will be in one big set, and then there'll be one or two sets that are much
- * smaller).
- *
- * This log structure makes bcache's btree more of a hybrid between a
- * conventional btree and a compacting data structure, with some of the
- * advantages of both.
- *
- * GARBAGE COLLECTION:
- *
- * We can't just invalidate any bucket - it might contain dirty data or
- * metadata. If it once contained dirty data, other writes might overwrite it
- * later, leaving no valid pointers into that bucket in the index.
- *
- * Thus, the primary purpose of garbage collection is to find buckets to reuse.
- * It also counts how much valid data each bucket currently contains, so that
- * allocation can reuse buckets sooner when they've been mostly overwritten.
- *
- * It also does some things that are really internal to the btree
- * implementation. If a btree node contains pointers that are stale by more than
- * some threshold, it rewrites the btree node to avoid the bucket's generation
- * wrapping around. It also merges adjacent btree nodes if they're empty enough.
- *
- * THE JOURNAL:
- *
- * Bcache's journal is not necessary for consistency; we always strictly
- * order metadata writes so that the btree and everything else is consistent on
- * disk in the event of an unclean shutdown, and in fact bcache had writeback
- * caching (with recovery from unclean shutdown) before journalling was
- * implemented.
- *
- * Rather, the journal is purely a performance optimization; we can't complete a
- * write until we've updated the index on disk, otherwise the cache would be
- * inconsistent in the event of an unclean shutdown. This means that without the
- * journal, on random write workloads we constantly have to update all the leaf
- * nodes in the btree, and those writes will be mostly empty (appending at most
- * a few keys each) - highly inefficient in terms of amount of metadata writes,
- * and it puts more strain on the various btree resorting/compacting code.
- *
- * The journal is just a log of keys we've inserted; on startup we just reinsert
- * all the keys in the open journal entries. That means that when we're updating
- * a node in the btree, we can wait until a 4k block of keys fills up before
- * writing them out.
- *
- * For simplicity, we only journal updates to leaf nodes; updates to parent
- * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
- * the complexity to deal with journalling them (in particular, journal replay)
- * - updates to non leaf nodes just happen synchronously (see btree_split()).
- */
-
-#undef pr_fmt
-#ifdef __KERNEL__
-#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
-#else
-#define pr_fmt(fmt) "%s() " fmt "\n", __func__
-#endif
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define ENUMERATED_REF_DEBUG
-#endif
-
-#ifndef dynamic_fault
-#define dynamic_fault(...) 0
-#endif
-
-#define race_fault(...) dynamic_fault("bcachefs:race")
-
-#include <linux/backing-dev-defs.h>
-#include <linux/bug.h>
-#include <linux/bio.h>
-#include <linux/closure.h>
-#include <linux/kobject.h>
-#include <linux/list.h>
-#include <linux/math64.h>
-#include <linux/mutex.h>
-#include <linux/percpu-refcount.h>
-#include <linux/percpu-rwsem.h>
-#include <linux/refcount.h>
-#include <linux/rhashtable.h>
-#include <linux/rwsem.h>
-#include <linux/semaphore.h>
-#include <linux/seqlock.h>
-#include <linux/shrinker.h>
-#include <linux/srcu.h>
-#include <linux/types.h>
-#include <linux/workqueue.h>
-#include <linux/zstd.h>
-#include <linux/unicode.h>
-
-#include "bcachefs_format.h"
-#include "btree_journal_iter_types.h"
-#include "disk_accounting_types.h"
-#include "errcode.h"
-#include "fast_list.h"
-#include "fifo.h"
-#include "nocow_locking_types.h"
-#include "opts.h"
-#include "sb-errors_types.h"
-#include "seqmutex.h"
-#include "snapshot_types.h"
-#include "time_stats.h"
-#include "util.h"
-
-#include "alloc_types.h"
-#include "async_objs_types.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-#include "btree_node_scan_types.h"
-#include "btree_write_buffer_types.h"
-#include "buckets_types.h"
-#include "buckets_waiting_for_journal_types.h"
-#include "clock_types.h"
-#include "disk_groups_types.h"
-#include "ec_types.h"
-#include "enumerated_ref_types.h"
-#include "journal_types.h"
-#include "keylist_types.h"
-#include "quota_types.h"
-#include "rebalance_types.h"
-#include "recovery_passes_types.h"
-#include "replicas_types.h"
-#include "sb-members_types.h"
-#include "subvolume_types.h"
-#include "super_types.h"
-#include "thread_with_file_types.h"
-
-#include "trace.h"
-
-#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
-
-#define trace_and_count(_c, _name, ...) \
-do { \
- count_event(_c, _name); \
- trace_##_name(__VA_ARGS__); \
-} while (0)
-
-#define bch2_fs_init_fault(name) \
- dynamic_fault("bcachefs:bch_fs_init:" name)
-#define bch2_meta_read_fault(name) \
- dynamic_fault("bcachefs:meta:read:" name)
-#define bch2_meta_write_fault(name) \
- dynamic_fault("bcachefs:meta:write:" name)
-
-#ifdef __KERNEL__
-#define BCACHEFS_LOG_PREFIX
-#endif
-
-#ifdef BCACHEFS_LOG_PREFIX
-
-#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name)
-#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name)
-#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset)
-#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
-#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
- "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset)
-
-#else
-
-#define bch2_log_msg(_c, fmt) fmt
-#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name)
-#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset)
-#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
-#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
- "inum %llu offset %llu: " fmt "\n", (_inum), (_offset)
-
-#endif
-
-#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
-
-void bch2_print_str(struct bch_fs *, const char *, const char *);
-
-__printf(2, 3)
-void bch2_print_opts(struct bch_opts *, const char *, ...);
-
-__printf(2, 3)
-void __bch2_print(struct bch_fs *c, const char *fmt, ...);
-
-#define maybe_dev_to_fs(_c) _Generic((_c), \
- struct bch_dev *: ((struct bch_dev *) (_c))->fs, \
- struct bch_fs *: (_c))
-
-#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
-
-#define bch2_print_ratelimited(_c, ...) \
-do { \
- static DEFINE_RATELIMIT_STATE(_rs, \
- DEFAULT_RATELIMIT_INTERVAL, \
- DEFAULT_RATELIMIT_BURST); \
- \
- if (__ratelimit(&_rs)) \
- bch2_print(_c, __VA_ARGS__); \
-} while (0)
-
-#define bch2_print_str_ratelimited(_c, ...) \
-do { \
- static DEFINE_RATELIMIT_STATE(_rs, \
- DEFAULT_RATELIMIT_INTERVAL, \
- DEFAULT_RATELIMIT_BURST); \
- \
- if (__ratelimit(&_rs)) \
- bch2_print_str(_c, __VA_ARGS__); \
-} while (0)
-
-#define bch_info(c, fmt, ...) \
- bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_info_ratelimited(c, fmt, ...) \
- bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_notice(c, fmt, ...) \
- bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn(c, fmt, ...) \
- bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn_ratelimited(c, fmt, ...) \
- bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
-
-#define bch_err(c, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err_dev(ca, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev_offset(ca, _offset, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
-#define bch_err_inum(c, _inum, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
-#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
- bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
-
-#define bch_err_ratelimited(c, fmt, ...) \
- bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err_dev_ratelimited(ca, fmt, ...) \
- bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
- bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
-#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
- bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
-#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
- bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
-
-static inline bool should_print_err(int err)
-{
- return err && !bch2_err_matches(err, BCH_ERR_transaction_restart);
-}
-
-#define bch_err_fn(_c, _ret) \
-do { \
- if (should_print_err(_ret)) \
- bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
-} while (0)
-
-#define bch_err_fn_ratelimited(_c, _ret) \
-do { \
- if (should_print_err(_ret)) \
- bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
-} while (0)
-
-#define bch_err_msg(_c, _ret, _msg, ...) \
-do { \
- if (should_print_err(_ret)) \
- bch_err(_c, "%s(): error " _msg " %s", __func__, \
- ##__VA_ARGS__, bch2_err_str(_ret)); \
-} while (0)
-
-#define bch_verbose(c, fmt, ...) \
-do { \
- if ((c)->opts.verbose) \
- bch_info(c, fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define bch_verbose_ratelimited(c, fmt, ...) \
-do { \
- if ((c)->opts.verbose) \
- bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define pr_verbose_init(opts, fmt, ...) \
-do { \
- if (opt_get(opts, verbose)) \
- pr_info(fmt, ##__VA_ARGS__); \
-} while (0)
-
-static inline int __bch2_err_trace(struct bch_fs *c, int err)
-{
- trace_error_throw(c, err, _THIS_IP_);
- return err;
-}
-
-#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
-
-/* Parameters that are useful for debugging, but should always be compiled in: */
-#define BCH_DEBUG_PARAMS_ALWAYS() \
- BCH_DEBUG_PARAM(key_merging_disabled, \
- "Disables merging of extents") \
- BCH_DEBUG_PARAM(btree_node_merging_disabled, \
- "Disables merging of btree nodes") \
- BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
- "Causes mark and sweep to compact and rewrite every " \
- "btree node it traverses") \
- BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
- "Disables rewriting of btree nodes during mark and sweep")\
- BCH_DEBUG_PARAM(btree_shrinker_disabled, \
- "Disables the shrinker callback for the btree node cache")\
- BCH_DEBUG_PARAM(verify_btree_ondisk, \
- "Reread btree nodes at various points to verify the " \
- "mergesort in the read path against modifications " \
- "done in memory") \
- BCH_DEBUG_PARAM(verify_all_btree_replicas, \
- "When reading btree nodes, read all replicas and " \
- "compare them") \
- BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
- "Don't use the write buffer for backpointers, enabling "\
- "extra runtime checks") \
- BCH_DEBUG_PARAM(debug_check_btree_locking, \
- "Enable additional asserts for btree locking") \
- BCH_DEBUG_PARAM(debug_check_iterators, \
- "Enables extra verification for btree iterators") \
- BCH_DEBUG_PARAM(debug_check_bset_lookups, \
- "Enables extra verification for bset lookups") \
- BCH_DEBUG_PARAM(debug_check_btree_accounting, \
- "Verify btree accounting for keys within a node") \
- BCH_DEBUG_PARAM(debug_check_bkey_unpack, \
- "Enables extra verification for bkey unpack")
-
-/* Parameters that should only be compiled in debug mode: */
-#define BCH_DEBUG_PARAMS_DEBUG() \
- BCH_DEBUG_PARAM(journal_seq_verify, \
- "Store the journal sequence number in the version " \
- "number of every btree key, and verify that btree " \
- "update ordering is preserved during recovery") \
- BCH_DEBUG_PARAM(inject_invalid_keys, \
- "Store the journal sequence number in the version " \
- "number of every btree key, and verify that btree " \
- "update ordering is preserved during recovery") \
- BCH_DEBUG_PARAM(test_alloc_startup, \
- "Force allocator startup to use the slowpath where it" \
- "can't find enough free buckets without invalidating" \
- "cached data") \
- BCH_DEBUG_PARAM(force_reconstruct_read, \
- "Force reads to use the reconstruct path, when reading" \
- "from erasure coded extents") \
- BCH_DEBUG_PARAM(test_restart_gc, \
- "Test restarting mark and sweep gc when bucket gens change")
-
-#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
-#else
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
-#endif
-
-#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name;
-BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_TIME_STATS() \
- x(btree_node_mem_alloc) \
- x(btree_node_split) \
- x(btree_node_compact) \
- x(btree_node_merge) \
- x(btree_node_sort) \
- x(btree_node_get) \
- x(btree_node_read) \
- x(btree_node_read_done) \
- x(btree_node_write) \
- x(btree_interior_update_foreground) \
- x(btree_interior_update_total) \
- x(btree_gc) \
- x(data_write) \
- x(data_write_to_submit) \
- x(data_write_to_queue) \
- x(data_write_to_btree_update) \
- x(data_write_btree_update) \
- x(data_read) \
- x(data_promote) \
- x(journal_flush_write) \
- x(journal_noflush_write) \
- x(journal_flush_seq) \
- x(blocked_journal_low_on_space) \
- x(blocked_journal_low_on_pin) \
- x(blocked_journal_max_in_flight) \
- x(blocked_journal_max_open) \
- x(blocked_key_cache_flush) \
- x(blocked_allocate) \
- x(blocked_allocate_open_bucket) \
- x(blocked_write_buffer_full) \
- x(nocow_lock_contended)
-
-enum bch_time_stats {
-#define x(name) BCH_TIME_##name,
- BCH_TIME_STATS()
-#undef x
- BCH_TIME_STAT_NR
-};
-
-/* Number of nodes btree coalesce will try to coalesce at once */
-#define GC_MERGE_NODES 4U
-
-/* Maximum number of nodes we might need to allocate atomically: */
-#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
-
-/* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
-
-#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
-
-struct btree;
-
-struct io_count {
- u64 sectors[2][BCH_DATA_NR];
-};
-
-struct discard_in_flight {
- bool in_progress:1;
- u64 bucket:63;
-};
-
-#define BCH_DEV_READ_REFS() \
- x(bch2_online_devs) \
- x(trans_mark_dev_sbs) \
- x(read_fua_test) \
- x(sb_field_resize) \
- x(write_super) \
- x(journal_read) \
- x(fs_journal_alloc) \
- x(fs_resize_on_mount) \
- x(btree_node_read) \
- x(btree_node_read_all_replicas) \
- x(btree_node_scrub) \
- x(btree_node_write) \
- x(btree_node_scan) \
- x(btree_verify_replicas) \
- x(btree_node_ondisk_to_text) \
- x(io_read) \
- x(check_extent_checksums) \
- x(ec_block)
-
-enum bch_dev_read_ref {
-#define x(n) BCH_DEV_READ_REF_##n,
- BCH_DEV_READ_REFS()
-#undef x
- BCH_DEV_READ_REF_NR,
-};
-
-#define BCH_DEV_WRITE_REFS() \
- x(journal_write) \
- x(journal_do_discards) \
- x(dev_do_discards) \
- x(discard_one_bucket_fast) \
- x(do_invalidates) \
- x(nocow_flush) \
- x(io_write) \
- x(ec_block) \
- x(ec_bucket_zero)
-
-enum bch_dev_write_ref {
-#define x(n) BCH_DEV_WRITE_REF_##n,
- BCH_DEV_WRITE_REFS()
-#undef x
- BCH_DEV_WRITE_REF_NR,
-};
-
-struct bucket_bitmap {
- unsigned long *buckets;
- u64 nr;
- struct mutex lock;
-};
-
-struct bch_dev {
- struct kobject kobj;
-#ifdef CONFIG_BCACHEFS_DEBUG
- atomic_long_t ref;
- bool dying;
- unsigned long last_put;
-#else
- struct percpu_ref ref;
-#endif
- struct completion ref_completion;
- struct enumerated_ref io_ref[2];
-
- struct bch_fs *fs;
-
- u8 dev_idx;
- /*
- * Cached version of this device's member info from superblock
- * Committed by bch2_write_super() -> bch_fs_mi_update()
- */
- struct bch_member_cpu mi;
- atomic64_t errors[BCH_MEMBER_ERROR_NR];
- unsigned long write_errors_start;
-
- __uuid_t uuid;
- char name[BDEVNAME_SIZE];
-
- struct bch_sb_handle disk_sb;
- struct bch_sb *sb_read_scratch;
- int sb_write_error;
- dev_t dev;
- atomic_t flush_seq;
-
- struct bch_devs_mask self;
-
- /*
- * Buckets:
- * Per-bucket arrays are protected by either rcu_read_lock or
- * state_lock, for device resize.
- */
- GENRADIX(struct bucket) buckets_gc;
- struct bucket_gens __rcu *bucket_gens;
- u8 *oldest_gen;
- unsigned long *buckets_nouse;
-
- struct bucket_bitmap bucket_backpointer_mismatch;
- struct bucket_bitmap bucket_backpointer_empty;
-
- struct bch_dev_usage_full __percpu
- *usage;
-
- /* Allocator: */
- u64 alloc_cursor[3];
-
- unsigned nr_open_buckets;
- unsigned nr_partial_buckets;
- unsigned nr_btree_reserve;
-
- struct work_struct invalidate_work;
- struct work_struct discard_work;
- struct mutex discard_buckets_in_flight_lock;
- DARRAY(struct discard_in_flight) discard_buckets_in_flight;
- struct work_struct discard_fast_work;
-
- atomic64_t rebalance_work;
-
- struct journal_device journal;
- u64 prev_journal_sector;
-
- struct work_struct io_error_work;
-
- /* The rest of this all shows up in sysfs */
- atomic64_t cur_latency[2];
- struct bch2_time_stats_quantiles io_latency[2];
-
-#define CONGESTED_MAX 1024
- atomic_t congested;
- u64 congested_last;
-
- struct io_count __percpu *io_done;
-};
-
-/*
- * initial_gc_unfixed
- * error
- * topology error
- */
-
-#define BCH_FS_FLAGS() \
- x(new_fs) \
- x(started) \
- x(clean_recovery) \
- x(btree_running) \
- x(accounting_replay_done) \
- x(may_go_rw) \
- x(rw) \
- x(rw_init_done) \
- x(was_rw) \
- x(stopping) \
- x(emergency_ro) \
- x(going_ro) \
- x(write_disable_complete) \
- x(clean_shutdown) \
- x(in_recovery) \
- x(in_fsck) \
- x(initial_gc_unfixed) \
- x(need_delete_dead_snapshots) \
- x(error) \
- x(topology_error) \
- x(errors_fixed) \
- x(errors_not_fixed) \
- x(no_invalid_checks) \
- x(discard_mount_opt_set) \
-
-enum bch_fs_flags {
-#define x(n) BCH_FS_##n,
- BCH_FS_FLAGS()
-#undef x
-};
-
-struct btree_debug {
- unsigned id;
-};
-
-#define BCH_TRANSACTIONS_NR 128
-
-struct btree_transaction_stats {
- struct bch2_time_stats duration;
- struct bch2_time_stats lock_hold_times;
- struct mutex lock;
- unsigned nr_max_paths;
- unsigned max_mem;
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_trans_kmalloc_trace trans_kmalloc_trace;
-#endif
- char *max_paths_text;
-};
-
-struct bch_fs_pcpu {
- u64 sectors_available;
-};
-
-struct journal_seq_blacklist_table {
- size_t nr;
- struct journal_seq_blacklist_table_entry {
- u64 start;
- u64 end;
- bool dirty;
- } entries[];
-};
-
-struct btree_trans_buf {
- struct btree_trans *trans;
-};
-
-#define BCH_WRITE_REFS() \
- x(journal) \
- x(trans) \
- x(write) \
- x(promote) \
- x(node_rewrite) \
- x(stripe_create) \
- x(stripe_delete) \
- x(reflink) \
- x(fallocate) \
- x(fsync) \
- x(dio_write) \
- x(discard) \
- x(discard_fast) \
- x(check_discard_freespace_key) \
- x(invalidate) \
- x(delete_dead_snapshots) \
- x(gc_gens) \
- x(snapshot_delete_pagecache) \
- x(sysfs) \
- x(btree_write_buffer) \
- x(btree_node_scrub) \
- x(async_recovery_passes) \
- x(ioctl_data)
-
-enum bch_write_ref {
-#define x(n) BCH_WRITE_REF_##n,
- BCH_WRITE_REFS()
-#undef x
- BCH_WRITE_REF_NR,
-};
-
-#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
-
-struct bch_fs {
- struct closure cl;
-
- struct list_head list;
- struct kobject kobj;
- struct kobject counters_kobj;
- struct kobject internal;
- struct kobject opts_dir;
- struct kobject time_stats;
- unsigned long flags;
-
- int minor;
- struct device *chardev;
- struct super_block *vfs_sb;
- dev_t dev;
- char name[40];
- struct stdio_redirect *stdio;
- struct task_struct *stdio_filter;
-
- /* ro/rw, add/remove/resize devices: */
- struct rw_semaphore state_lock;
-
- /* Counts outstanding writes, for clean transition to read-only */
- struct enumerated_ref writes;
- /*
- * Certain operations are only allowed in single threaded mode, during
- * recovery, and we want to assert that this is the case:
- */
- struct task_struct *recovery_task;
-
- /*
- * Analogous to c->writes, for asynchronous ops that don't necessarily
- * need fs to be read-write
- */
- refcount_t ro_ref;
- wait_queue_head_t ro_ref_wait;
-
- struct work_struct read_only_work;
-
- struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
-
- struct bch_accounting_mem accounting;
-
- struct bch_replicas_cpu replicas;
- struct bch_replicas_cpu replicas_gc;
- struct mutex replicas_gc_lock;
-
- struct journal_entry_res btree_root_journal_res;
- struct journal_entry_res clock_journal_res;
-
- struct bch_disk_groups_cpu __rcu *disk_groups;
-
- struct bch_opts opts;
-
- /* Updated by bch2_sb_update():*/
- struct {
- __uuid_t uuid;
- __uuid_t user_uuid;
-
- u16 version;
- u16 version_incompat;
- u16 version_incompat_allowed;
- u16 version_min;
- u16 version_upgrade_complete;
-
- u8 nr_devices;
- u8 clean;
- bool multi_device; /* true if we've ever had more than one device */
-
- u8 encryption_type;
-
- u64 time_base_lo;
- u32 time_base_hi;
- unsigned time_units_per_sec;
- unsigned nsec_per_time_unit;
- u64 features;
- u64 compat;
- u64 recovery_passes_required;
- unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
- u64 btrees_lost_data;
- } sb;
- DARRAY(enum bcachefs_metadata_version)
- incompat_versions_requested;
-
- struct unicode_map *cf_encoding;
-
- struct bch_sb_handle disk_sb;
-
- unsigned short block_bits; /* ilog2(block_size) */
-
- u16 btree_foreground_merge_threshold;
-
- struct closure sb_write;
- struct mutex sb_lock;
-
- /* snapshot.c: */
- struct snapshot_table __rcu *snapshots;
- struct mutex snapshot_table_lock;
- struct rw_semaphore snapshot_create_lock;
-
- struct snapshot_delete snapshot_delete;
- struct work_struct snapshot_wait_for_pagecache_and_delete_work;
- snapshot_id_list snapshots_unlinked;
- struct mutex snapshots_unlinked_lock;
-
- /* BTREE CACHE */
- struct bio_set btree_bio;
- struct workqueue_struct *btree_read_complete_wq;
- struct workqueue_struct *btree_write_submit_wq;
-
- struct btree_root btree_roots_known[BTREE_ID_NR];
- DARRAY(struct btree_root) btree_roots_extra;
- struct mutex btree_root_lock;
-
- struct btree_cache btree_cache;
-
- /*
- * Cache of allocated btree nodes - if we allocate a btree node and
- * don't use it, freeing it means that space can't be reused until going
- * _all_ the way through the allocator (which exposes us to a livelock
- * when allocating btree reserves fails halfway through) - instead, we
- * can stick them here:
- */
- struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
- unsigned btree_reserve_cache_nr;
- struct mutex btree_reserve_cache_lock;
-
- mempool_t btree_interior_update_pool;
- struct list_head btree_interior_update_list;
- struct list_head btree_interior_updates_unwritten;
- struct mutex btree_interior_update_lock;
- struct closure_waitlist btree_interior_update_wait;
-
- struct workqueue_struct *btree_interior_update_worker;
- struct work_struct btree_interior_update_work;
-
- struct workqueue_struct *btree_node_rewrite_worker;
- struct list_head btree_node_rewrites;
- struct list_head btree_node_rewrites_pending;
- spinlock_t btree_node_rewrites_lock;
- struct closure_waitlist btree_node_rewrites_wait;
-
- /* btree_io.c: */
- spinlock_t btree_write_error_lock;
- struct btree_write_stats {
- atomic64_t nr;
- atomic64_t bytes;
- } btree_write_stats[BTREE_WRITE_TYPE_NR];
-
- /* btree_iter.c: */
- struct seqmutex btree_trans_lock;
- struct list_head btree_trans_list;
- mempool_t btree_trans_pool;
- mempool_t btree_trans_mem_pool;
- struct btree_trans_buf __percpu *btree_trans_bufs;
-
- struct srcu_struct btree_trans_barrier;
- bool btree_trans_barrier_initialized;
-
- struct btree_key_cache btree_key_cache;
- unsigned btree_key_cache_btrees;
-
- struct btree_write_buffer btree_write_buffer;
-
- struct workqueue_struct *btree_update_wq;
- struct workqueue_struct *btree_write_complete_wq;
- /* copygc needs its own workqueue for index updates */
- struct workqueue_struct *copygc_wq;
- /*
- * Use a dedicated wq for write ref holder tasks. Required to avoid
- * dependency problems with other wq tasks that can block on ref
- * draining, such as read-only transition.
- */
- struct workqueue_struct *write_ref_wq;
-
- /* ALLOCATION */
- struct bch_devs_mask online_devs;
- struct bch_devs_mask rw_devs[BCH_DATA_NR];
- unsigned long rw_devs_change_count;
-
- u64 capacity; /* sectors */
- u64 reserved; /* sectors */
-
- /*
- * When capacity _decreases_ (due to a disk being removed), we
- * increment capacity_gen - this invalidates outstanding reservations
- * and forces them to be revalidated
- */
- u32 capacity_gen;
- unsigned bucket_size_max;
-
- atomic64_t sectors_available;
- struct mutex sectors_available_lock;
-
- struct bch_fs_pcpu __percpu *pcpu;
-
- struct percpu_rw_semaphore mark_lock;
-
- seqcount_t usage_lock;
- struct bch_fs_usage_base __percpu *usage;
- u64 __percpu *online_reserved;
-
- unsigned long allocator_last_stuck;
-
- struct io_clock io_clock[2];
-
- /* JOURNAL SEQ BLACKLIST */
- struct journal_seq_blacklist_table *
- journal_seq_blacklist_table;
-
- /* ALLOCATOR */
- spinlock_t freelist_lock;
- struct closure_waitlist freelist_wait;
-
- open_bucket_idx_t open_buckets_freelist;
- open_bucket_idx_t open_buckets_nr_free;
- struct closure_waitlist open_buckets_wait;
- struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
- open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT];
-
- open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
- open_bucket_idx_t open_buckets_partial_nr;
-
- struct write_point btree_write_point;
- struct write_point rebalance_write_point;
-
- struct write_point write_points[WRITE_POINT_MAX];
- struct hlist_head write_points_hash[WRITE_POINT_HASH_NR];
- struct mutex write_points_hash_lock;
- unsigned write_points_nr;
-
- struct buckets_waiting_for_journal buckets_waiting_for_journal;
-
- /* GARBAGE COLLECTION */
- struct work_struct gc_gens_work;
- unsigned long gc_count;
-
- enum btree_id gc_gens_btree;
- struct bpos gc_gens_pos;
-
- /*
- * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
- * has been marked by GC.
- *
- * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.)
- *
- * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
- * can read without a lock.
- */
- seqcount_t gc_pos_lock;
- struct gc_pos gc_pos;
-
- /*
- * The allocation code needs gc_mark in struct bucket to be correct, but
- * it's not while a gc is in progress.
- */
- struct rw_semaphore gc_lock;
- struct mutex gc_gens_lock;
-
- /* IO PATH */
- struct semaphore io_in_flight;
- struct bio_set bio_read;
- struct bio_set bio_read_split;
- struct bio_set bio_write;
- struct bio_set replica_set;
- struct mutex bio_bounce_pages_lock;
- mempool_t bio_bounce_pages;
- struct bucket_nocow_lock_table
- nocow_locks;
- struct rhashtable promote_table;
-
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR];
-#endif
-
- mempool_t compression_bounce[2];
- mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
- size_t zstd_workspace_size;
-
- struct bch_key chacha20_key;
- bool chacha20_key_set;
-
- atomic64_t key_version;
-
- mempool_t large_bkey_pool;
-
- /* MOVE.C */
- struct list_head moving_context_list;
- struct mutex moving_context_lock;
-
- /* REBALANCE */
- struct bch_fs_rebalance rebalance;
-
- /* COPYGC */
- struct task_struct *copygc_thread;
- struct write_point copygc_write_point;
- s64 copygc_wait_at;
- s64 copygc_wait;
- bool copygc_running;
- wait_queue_head_t copygc_running_wq;
-
- /* STRIPES: */
- GENRADIX(struct gc_stripe) gc_stripes;
-
- struct hlist_head ec_stripes_new[32];
- spinlock_t ec_stripes_new_lock;
-
- /* ERASURE CODING */
- struct list_head ec_stripe_head_list;
- struct mutex ec_stripe_head_lock;
-
- struct list_head ec_stripe_new_list;
- struct mutex ec_stripe_new_lock;
- wait_queue_head_t ec_stripe_new_wait;
-
- struct work_struct ec_stripe_create_work;
- u64 ec_stripe_hint;
-
- struct work_struct ec_stripe_delete_work;
-
- struct bio_set ec_bioset;
-
- /* REFLINK */
- reflink_gc_table reflink_gc_table;
- size_t reflink_gc_nr;
-
- /* fs.c */
- struct list_head vfs_inodes_list;
- struct mutex vfs_inodes_lock;
- struct rhashtable vfs_inodes_table;
- struct rhltable vfs_inodes_by_inum_table;
-
- /* VFS IO PATH - fs-io.c */
- struct bio_set writepage_bioset;
- struct bio_set dio_write_bioset;
- struct bio_set dio_read_bioset;
- struct bio_set nocow_flush_bioset;
-
- /* QUOTAS */
- struct bch_memquota_type quotas[QTYP_NR];
-
- /* RECOVERY */
- u64 journal_replay_seq_start;
- u64 journal_replay_seq_end;
- struct bch_fs_recovery recovery;
-
- /* DEBUG JUNK */
- struct dentry *fs_debug_dir;
- struct dentry *btree_debug_dir;
- struct dentry *async_obj_dir;
- struct btree_debug btree_debug[BTREE_ID_NR];
- struct btree *verify_data;
- struct btree_node *verify_ondisk;
- struct mutex verify_lock;
-
- /*
- * A btree node on disk could have too many bsets for an iterator to fit
- * on the stack - have to dynamically allocate them
- */
- mempool_t fill_iter;
-
- mempool_t btree_bounce_pool;
-
- struct journal journal;
- GENRADIX(struct journal_replay *) journal_entries;
- u64 journal_entries_base_seq;
- struct journal_keys journal_keys;
- struct list_head journal_iters;
-
- struct find_btree_nodes found_btree_nodes;
-
- u64 last_bucket_seq_cleanup;
-
- u64 counters_on_mount[BCH_COUNTER_NR];
- u64 __percpu *counters;
-
- struct bch2_time_stats times[BCH_TIME_STAT_NR];
-
- struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
-
- /* ERRORS */
- struct list_head fsck_error_msgs;
- struct mutex fsck_error_msgs_lock;
- bool fsck_alloc_msgs_err;
-
- bch_sb_errors_cpu fsck_error_counts;
- struct mutex fsck_error_counts_lock;
-};
-
-extern struct wait_queue_head bch2_read_only_wait;
-
-static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
-{
- if (test_bit(BCH_FS_stopping, &c->flags))
- return false;
-
- return refcount_inc_not_zero(&c->ro_ref);
-}
-
-static inline void bch2_ro_ref_put(struct bch_fs *c)
-{
- if (refcount_dec_and_test(&c->ro_ref))
- wake_up(&c->ro_ref_wait);
-}
-
-static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
-{
-#ifndef NO_BCACHEFS_FS
- if (c->vfs_sb)
- c->vfs_sb->s_bdi->ra_pages = ra_pages;
-#endif
-}
-
-static inline unsigned bucket_bytes(const struct bch_dev *ca)
-{
- return ca->mi.bucket_size << 9;
-}
-
-static inline unsigned block_bytes(const struct bch_fs *c)
-{
- return c->opts.block_size;
-}
-
-static inline unsigned block_sectors(const struct bch_fs *c)
-{
- return c->opts.block_size >> 9;
-}
-
-static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
-{
- return c->btree_key_cache_btrees & (1U << btree);
-}
-
-static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
-{
- struct timespec64 t;
- s64 sec;
- s32 rem;
-
- time += c->sb.time_base_lo;
-
- sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
-
- set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
-
- return t;
-}
-
-static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
-{
- return (ts.tv_sec * c->sb.time_units_per_sec +
- (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
-}
-
-static inline s64 bch2_current_time(const struct bch_fs *c)
-{
- struct timespec64 now;
-
- ktime_get_coarse_real_ts64(&now);
- return timespec_to_bch2_time(c, now);
-}
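-
-/*
- * Illustrative sketch, not part of the original header (the helper name is
- * hypothetical): on-disk times are a scaled offset from sb.time_base_lo, so
- * converting to a timespec64 and back should round-trip to within one time
- * unit.
- */
-static inline bool bch2_time_roundtrips(const struct bch_fs *c, s64 time)
-{
-	struct timespec64 ts = bch2_time_to_timespec(c, time);
-
-	return abs(timespec_to_bch2_time(c, ts) - time) <= 1;
-}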
-
-static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
-{
- return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX);
-}
-
-static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
-{
- struct stdio_redirect *stdio = c->stdio;
-
- if (c->stdio_filter && c->stdio_filter != current)
- stdio = NULL;
- return stdio;
-}
-
-static inline unsigned metadata_replicas_required(struct bch_fs *c)
-{
- return min(c->opts.metadata_replicas,
- c->opts.metadata_replicas_required);
-}
-
-static inline unsigned data_replicas_required(struct bch_fs *c)
-{
- return min(c->opts.data_replicas,
- c->opts.data_replicas_required);
-}
-
-#define BKEY_PADDED_ONSTACK(key, pad) \
- struct { struct bkey_i key; __u64 key ## _pad[pad]; }
-
-/*
- * This is needed because discard is both a filesystem option and a device
- * option, and mount options are supposed to apply to that mount and not be
- * persisted, i.e. if it's set as a mount option we can't propagate it to the
- * device.
- */
-static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
-{
- return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
- ? c->opts.discard
- : ca->mi.discard;
-}
-
-static inline bool bch2_fs_casefold_enabled(struct bch_fs *c)
-{
-#ifdef CONFIG_UNICODE
- return !c->opts.casefold_disabled;
-#else
- return false;
-#endif
-}
-
-#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
deleted file mode 100644
index b4a04df5ea95..000000000000
--- a/fs/bcachefs/bcachefs_format.h
+++ /dev/null
@@ -1,1545 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FORMAT_H
-#define _BCACHEFS_FORMAT_H
-
-/*
- * bcachefs on disk data structures
- *
- * OVERVIEW:
- *
- * There are three main types of on disk data structures in bcachefs (this is
- * reduced from 5 in bcache)
- *
- * - superblock
- * - journal
- * - btree
- *
- * The btree is the primary structure; most metadata exists as keys in the
- * various btrees. There are only a small number of btrees, they're not
- * sharded - we have one btree for extents, another for inodes, et cetera.
- *
- * SUPERBLOCK:
- *
- * The superblock contains the location of the journal, the list of devices in
- * the filesystem, and in general any metadata we need in order to decide
- * whether we can start a filesystem, or that we need prior to reading the
- * journal/btree roots.
- *
- * The superblock is extensible, and most of the contents of the superblock are
- * in variable length, type tagged fields; see struct bch_sb_field.
- *
- * Backup superblocks do not reside in a fixed location; also, superblocks do
- * not have a fixed size. To locate backup superblocks we have struct
- * bch_sb_layout; we store a copy of this inside every superblock, and also
- * before the first superblock.
- *
- * JOURNAL:
- *
- * The journal primarily records btree updates in the order they occurred;
- * journal replay consists of just iterating over all the keys in the open
- * journal entries and re-inserting them into the btrees.
- *
- * The journal also contains entry types for the btree roots, and blacklisted
- * journal sequence numbers (see journal_seq_blacklist.c).
- *
- * BTREE:
- *
- * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
- * 128k-256k) and log structured. We use struct btree_node for writing the first
- * entry in a given node (offset 0), and struct btree_node_entry for all
- * subsequent writes.
- *
- * After the header, btree node entries contain a list of keys in sorted order.
- * Values are stored inline with the keys; since values are variable length (and
- * keys effectively are variable length too, due to packing) we can't do random
- * access without building up additional in memory tables in the btree node read
- * path.
- *
- * BTREE KEYS (struct bkey):
- *
- * The various btrees share a common format for the key - so as to avoid
- * switching in fastpath lookup/comparison code - but define their own
- * structures for the key values.
- *
- * The size of a key/value pair is stored as a u8 in units of u64s, so the max
- * size is just under 2k. The common part also contains a type tag for the
- * value, and a format field indicating whether the key is packed or not (and
- * also meant to allow adding new key fields in the future, if desired).
- *
- * bkeys, when stored within a btree node, may also be packed. In that case, the
- * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
- * be generous with field sizes in the common part of the key format (64 bit
- * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
- */
-
-#include <asm/types.h>
-#include <asm/byteorder.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <uapi/linux/magic.h>
-#include "vstructs.h"
-
-#ifdef __KERNEL__
-typedef uuid_t __uuid_t;
-#endif
-
-#define BITMASK(name, type, field, offset, end) \
-static const __maybe_unused unsigned name##_OFFSET = offset; \
-static const __maybe_unused unsigned name##_BITS = (end - offset); \
- \
-static inline __u64 name(const type *k) \
-{ \
- return (k->field >> offset) & ~(~0ULL << (end - offset)); \
-} \
- \
-static inline void SET_##name(type *k, __u64 v) \
-{ \
- k->field &= ~(~(~0ULL << (end - offset)) << offset); \
- k->field |= (v & ~(~0ULL << (end - offset))) << offset; \
-}
-
-#define LE_BITMASK(_bits, name, type, field, offset, end) \
-static const __maybe_unused unsigned name##_OFFSET = offset; \
-static const __maybe_unused unsigned name##_BITS = (end - offset); \
-static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
- \
-static inline __u64 name(const type *k) \
-{ \
- return (__le##_bits##_to_cpu(k->field) >> offset) & \
- ~(~0ULL << (end - offset)); \
-} \
- \
-static inline void SET_##name(type *k, __u64 v) \
-{ \
- __u##_bits new = __le##_bits##_to_cpu(k->field); \
- \
- new &= ~(~(~0ULL << (end - offset)) << offset); \
- new |= (v & ~(~0ULL << (end - offset))) << offset; \
- k->field = __cpu_to_le##_bits(new); \
-}
-
-#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e)
-#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e)
-#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e)
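-
-/*
- * Illustrative note: an invocation such as
- *
- *	LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2);
- *
- * generates BCH_SB_CLEAN_OFFSET/_BITS/_MAX constants, a getter
- * BCH_SB_CLEAN(sb) returning bit 1 of flags[0], and a setter
- * SET_BCH_SB_CLEAN(sb, v) - both do the little-endian conversion
- * internally, so callers never touch the raw __le64 words directly.
- */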
-
-struct bkey_format {
- __u8 key_u64s;
- __u8 nr_fields;
- /* One unused slot for now: */
- __u8 bits_per_field[6];
- __le64 field_offset[6];
-};
-
-/* Btree keys - all units are in sectors */
-
-struct bpos {
- /*
- * Word order matches machine byte order - btree code treats a bpos as a
- * single large integer, for search/comparison purposes
- *
- * Note that wherever a bpos is embedded in another on disk data
- * structure, it has to be byte swabbed when reading in metadata that
- * wasn't written in native endian order:
- */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u32 snapshot;
- __u64 offset;
- __u64 inode;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- __u64 inode;
- __u64 offset; /* Points to end of extent - sectors */
- __u32 snapshot;
-#else
-#error edit for your odd byteorder.
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-#define KEY_INODE_MAX ((__u64)~0ULL)
-#define KEY_OFFSET_MAX ((__u64)~0ULL)
-#define KEY_SNAPSHOT_MAX ((__u32)~0U)
-#define KEY_SIZE_MAX ((__u32)~0U)
-
-static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
-{
- return (struct bpos) {
- .inode = inode,
- .offset = offset,
- .snapshot = snapshot,
- };
-}
-
-#define POS_MIN SPOS(0, 0, 0)
-#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
-#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
-#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
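-
-/*
- * Illustrative note (values are arbitrary examples): POS(4096, 128) names
- * inode 4096, offset 128 in snapshot 0, and SPOS(4096, 128, 7) the same
- * position within snapshot 7.  Because word order matches machine byte
- * order, the btree compares positions as one large integer: inode is most
- * significant, then offset, then snapshot.
- */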
-
-/* Empty placeholder struct, for container_of() */
-struct bch_val {
- __u64 __nothing[0];
-};
-
-struct bversion {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u64 lo;
- __u32 hi;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- __u32 hi;
- __u64 lo;
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-__aligned(4)
-#endif
-;
-
-struct bkey {
- /* Size of combined key and value, in u64s */
- __u8 u64s;
-
- /* Format of key (0 for format local to btree node) */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 format:7,
- needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u8 needs_whiteout:1,
- format:7;
-#else
-#error edit for your odd byteorder.
-#endif
-
- /* Type of the value */
- __u8 type;
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- __u8 pad[1];
-
- struct bversion bversion;
- __u32 size; /* extent size, in sectors */
- struct bpos p;
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- struct bpos p;
- __u32 size; /* extent size, in sectors */
- struct bversion bversion;
-
- __u8 pad[1];
-#endif
-} __packed
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-/*
- * The big-endian version of bkey can't be compiled by rustc with the "aligned"
- * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
- * So for Rust compatibility, don't include this. It can be included in the LE
- * version because the "packed" attr is redundant in that case.
- *
- * History: (quoting Kent)
- *
- * Specifically, when I was designing bkey, I wanted the header to be no
- * bigger than necessary so that bkey_packed could use the rest. That means that
- * decently often extent keys will fit into only 8 bytes, instead of spilling over
- * to 16.
- *
- * But packed_bkey treats the part after the header - the packed section -
- * as a single multi word, variable length integer. And bkey, the unpacked
- * version, is just a special case version of a bkey_packed; all the packed
- * bkey code will work on keys in any packed format, the in-memory
- * representation of an unpacked key also is just one type of packed key...
- *
- * So that constrains the key part of a big endian bkey to start right
- * after the header.
- *
- * If we ever do a bkey_v2 and need to expand the header by another byte for
- * some reason - that will clean up this wart.
- */
-__aligned(8)
-#endif
-;
-
-struct bkey_packed {
- __u64 _data[0];
-
- /* Size of combined key and value, in u64s */
- __u8 u64s;
-
- /* Format of key (0 for format local to btree node) */
-
- /*
- * XXX: next incompat on disk format change, switch format and
- * needs_whiteout - bkey_packed() will be cheaper if format is the high
- * bits of the bitfield
- */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 format:7,
- needs_whiteout:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u8 needs_whiteout:1,
- format:7;
-#endif
-
- /* Type of the value */
- __u8 type;
- __u8 key_start[0];
-
- /*
- * We copy bkeys with struct assignment in various places, and while
- * that shouldn't be done with packed bkeys, we can't disallow it in C,
- * and it's legal to cast a bkey to a bkey_packed - so padding it out
- * to the same size as struct bkey should hopefully be safest.
- */
- __u8 pad[sizeof(struct bkey) - 3];
-} __packed __aligned(8);
-
-typedef struct {
- __le64 lo;
- __le64 hi;
-} bch_le128;
-
-#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
-#define BKEY_U64s_MAX U8_MAX
-#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s)
-
-#define KEY_PACKED_BITS_START 24
-
-#define KEY_FORMAT_LOCAL_BTREE 0
-#define KEY_FORMAT_CURRENT 1
-
-enum bch_bkey_fields {
- BKEY_FIELD_INODE,
- BKEY_FIELD_OFFSET,
- BKEY_FIELD_SNAPSHOT,
- BKEY_FIELD_SIZE,
- BKEY_FIELD_VERSION_HI,
- BKEY_FIELD_VERSION_LO,
- BKEY_NR_FIELDS,
-};
-
-#define bkey_format_field(name, field) \
- [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
-
-#define BKEY_FORMAT_CURRENT \
-((struct bkey_format) { \
- .key_u64s = BKEY_U64s, \
- .nr_fields = BKEY_NR_FIELDS, \
- .bits_per_field = { \
- bkey_format_field(INODE, p.inode), \
- bkey_format_field(OFFSET, p.offset), \
- bkey_format_field(SNAPSHOT, p.snapshot), \
- bkey_format_field(SIZE, size), \
- bkey_format_field(VERSION_HI, bversion.hi), \
- bkey_format_field(VERSION_LO, bversion.lo), \
- }, \
-})
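-
-/*
- * Illustrative note: with the struct bkey layout above, BKEY_FORMAT_CURRENT
- * works out to 64 bits each for inode, offset and version_lo, and 32 bits
- * each for snapshot, size and version_hi - i.e. the format that simply
- * describes an unpacked key, with key_u64s == BKEY_U64s.
- */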
-
-/* bkey with inline value */
-struct bkey_i {
- __u64 _data[0];
-
- struct bkey k;
- struct bch_val v;
-};
-
-#define POS_KEY(_pos) \
-((struct bkey) { \
- .u64s = BKEY_U64s, \
- .format = KEY_FORMAT_CURRENT, \
- .p = _pos, \
-})
-
-#define KEY(_inode, _offset, _size) \
-((struct bkey) { \
- .u64s = BKEY_U64s, \
- .format = KEY_FORMAT_CURRENT, \
- .p = POS(_inode, _offset), \
- .size = _size, \
-})
-
-static inline void bkey_init(struct bkey *k)
-{
- *k = KEY(0, 0, 0);
-}
-
-#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64))
-
-#define __BKEY_PADDED(key, pad) \
- struct bkey_i key; __u64 key ## _pad[pad]
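-
-/*
- * Illustrative sketch, not part of the original header (variable names are
- * hypothetical): declaring a max-size key on the stack and initializing it.
- *
- *	struct { __BKEY_PADDED(k, BKEY_VAL_U64s_MAX); } tmp;
- *
- *	bkey_init(&tmp.k.k);
- *	tmp.k.k.p = SPOS(inum, offset, snapshot);
- *
- * BKEY_PADDED_ONSTACK() in bcachefs.h wraps the same pair of members in an
- * anonymous struct type.
- */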
-
-enum bch_bkey_type_flags {
- BKEY_TYPE_strict_btree_checks = BIT(0),
-};
-
-/*
- * - DELETED keys are used internally to mark keys that should be ignored but
- * override keys in composition order. Their version number is ignored.
- *
- * - DISCARDED keys indicate that the data is all 0s because it has been
- * discarded. DISCARDs may have a version; if the version is nonzero the key
- * will be persistent, otherwise the key will be dropped whenever the btree
- * node is rewritten (like DELETED keys).
- *
- * - ERROR: any read of the data returns a read error, as the data was lost due
- * to a failing device. Like DISCARDED keys, they can be removed (overridden)
- * by new writes or cluster-wide GC. Node repair can also overwrite them with
- * the same or a more recent version number, but not with an older version
- * number.
- *
- * - WHITEOUT: for hash table btrees
- */
-#define BCH_BKEY_TYPES() \
- x(deleted, 0, 0) \
- x(whiteout, 1, 0) \
- x(error, 2, 0) \
- x(cookie, 3, 0) \
- x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \
- x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \
- x(extent, 6, BKEY_TYPE_strict_btree_checks) \
- x(reservation, 7, BKEY_TYPE_strict_btree_checks) \
- x(inode, 8, BKEY_TYPE_strict_btree_checks) \
- x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \
- x(dirent, 10, BKEY_TYPE_strict_btree_checks) \
- x(xattr, 11, BKEY_TYPE_strict_btree_checks) \
- x(alloc, 12, BKEY_TYPE_strict_btree_checks) \
- x(quota, 13, BKEY_TYPE_strict_btree_checks) \
- x(stripe, 14, BKEY_TYPE_strict_btree_checks) \
- x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \
- x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \
- x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \
- x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \
- x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \
- x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \
- x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \
- x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \
- x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \
- x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \
- x(set, 25, 0) \
- x(lru, 26, BKEY_TYPE_strict_btree_checks) \
- x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \
- x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \
- x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \
- x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \
- x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \
- x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \
- x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \
- x(accounting, 34, BKEY_TYPE_strict_btree_checks) \
- x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks)
-
-enum bch_bkey_type {
-#define x(name, nr, ...) KEY_TYPE_##name = nr,
- BCH_BKEY_TYPES()
-#undef x
- KEY_TYPE_MAX,
-};
-
-struct bch_deleted {
- struct bch_val v;
-};
-
-struct bch_whiteout {
- struct bch_val v;
-};
-
-struct bch_error {
- struct bch_val v;
-};
-
-struct bch_cookie {
- struct bch_val v;
- __le64 cookie;
-};
-
-struct bch_hash_whiteout {
- struct bch_val v;
-};
-
-struct bch_set {
- struct bch_val v;
-};
-
-/* 128 bits, sufficient for cryptographic MACs: */
-struct bch_csum {
- __le64 lo;
- __le64 hi;
-} __packed __aligned(8);
-
-struct bch_backpointer {
- struct bch_val v;
- __u8 btree_id;
- __u8 level;
- __u8 data_type;
- __u8 bucket_gen;
- __u32 pad;
- __u32 bucket_len;
- struct bpos pos;
-} __packed __aligned(8);
-
-/* Optional/variable size superblock sections: */
-
-struct bch_sb_field {
- __u64 _data[0];
- __le32 u64s;
- __le32 type;
-};
-
-#define BCH_SB_FIELDS() \
- x(journal, 0) \
- x(members_v1, 1) \
- x(crypt, 2) \
- x(replicas_v0, 3) \
- x(quota, 4) \
- x(disk_groups, 5) \
- x(clean, 6) \
- x(replicas, 7) \
- x(journal_seq_blacklist, 8) \
- x(journal_v2, 9) \
- x(counters, 10) \
- x(members_v2, 11) \
- x(errors, 12) \
- x(ext, 13) \
- x(downgrade, 14) \
- x(recovery_passes, 15)
-
-#include "alloc_background_format.h"
-#include "dirent_format.h"
-#include "disk_accounting_format.h"
-#include "disk_groups_format.h"
-#include "extents_format.h"
-#include "ec_format.h"
-#include "inode_format.h"
-#include "journal_seq_blacklist_format.h"
-#include "logged_ops_format.h"
-#include "lru_format.h"
-#include "quota_format.h"
-#include "recovery_passes_format.h"
-#include "reflink_format.h"
-#include "replicas_format.h"
-#include "snapshot_format.h"
-#include "subvolume_format.h"
-#include "sb-counters_format.h"
-#include "sb-downgrade_format.h"
-#include "sb-errors_format.h"
-#include "sb-members_format.h"
-#include "xattr_format.h"
-
-enum bch_sb_field_type {
-#define x(f, nr) BCH_SB_FIELD_##f = nr,
- BCH_SB_FIELDS()
-#undef x
- BCH_SB_FIELD_NR
-};
-
-/*
- * Most superblock fields are replicated in all devices' superblocks - a few are
- * not:
- */
-#define BCH_SINGLE_DEVICE_SB_FIELDS \
- ((1U << BCH_SB_FIELD_journal)| \
- (1U << BCH_SB_FIELD_journal_v2))
-
-/* BCH_SB_FIELD_journal: */
-
-struct bch_sb_field_journal {
- struct bch_sb_field field;
- __le64 buckets[];
-};
-
-struct bch_sb_field_journal_v2 {
- struct bch_sb_field field;
-
- struct bch_sb_field_journal_v2_entry {
- __le64 start;
- __le64 nr;
- } d[];
-};
-
-/* BCH_SB_FIELD_crypt: */
-
-struct nonce {
- __le32 d[4];
-};
-
-struct bch_key {
- __le64 key[4];
-};
-
-#define BCH_KEY_MAGIC \
- (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \
- ((__u64) 'h' << 16)|((__u64) '*' << 24)| \
- ((__u64) '*' << 32)|((__u64) 'k' << 40)| \
- ((__u64) 'e' << 48)|((__u64) 'y' << 56))
-
-struct bch_encrypted_key {
- __le64 magic;
- struct bch_key key;
-};
-
-/*
- * If this field is present in the superblock, it stores an encryption key which
- * is used to encrypt all other data/metadata. The key will normally be encrypted
- * with the key userspace provides, but if encryption has been turned off we'll
- * just store the master key unencrypted in the superblock so we can access the
- * previously encrypted data.
- */
-struct bch_sb_field_crypt {
- struct bch_sb_field field;
-
- __le64 flags;
- __le64 kdf_flags;
- struct bch_encrypted_key key;
-};
-
-LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4);
-
-enum bch_kdf_types {
- BCH_KDF_SCRYPT = 0,
- BCH_KDF_NR = 1,
-};
-
-/* stored as base 2 log of scrypt params: */
-LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
-LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
-LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-
-/*
- * On clean shutdown, store btree roots and current journal sequence number in
- * the superblock:
- */
-struct jset_entry {
- __le16 u64s;
- __u8 btree_id;
- __u8 level;
- __u8 type; /* designates what this jset holds */
- __u8 pad[3];
-
- struct bkey_i start[0];
- __u64 _data[];
-};
-
-struct bch_sb_field_clean {
- struct bch_sb_field field;
-
- __le32 flags;
- __le16 _read_clock; /* no longer used */
- __le16 _write_clock;
- __le64 journal_seq;
-
- struct jset_entry start[0];
- __u64 _data[];
-};
-
-struct bch_sb_field_ext {
- struct bch_sb_field field;
- __le64 recovery_passes_required[2];
- __le64 errors_silent[8];
- __le64 btrees_lost_data;
-};
-
-/* Superblock: */
-
-/*
- * New versioning scheme:
- * One common version number for all on disk data structures - superblock, btree
- * nodes, journal entries
- */
-#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10))
-#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
-#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
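-
-/*
- * Illustrative note: the version is a 16 bit value with the major number in
- * the high 6 bits and the minor in the low 10, so BCH_VERSION(1, 7) == 0x407,
- * BCH_VERSION_MAJOR(0x407) == 1 and BCH_VERSION_MINOR(0x407) == 7.
- */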
-
-/*
- * field 1: version name
- * field 2: BCH_VERSION(major, minor)
- * field 3: recovery passes required on upgrade
- */
-#define BCH_METADATA_VERSIONS() \
- x(bkey_renumber, BCH_VERSION(0, 10)) \
- x(inode_btree_change, BCH_VERSION(0, 11)) \
- x(snapshot, BCH_VERSION(0, 12)) \
- x(inode_backpointers, BCH_VERSION(0, 13)) \
- x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \
- x(snapshot_2, BCH_VERSION(0, 15)) \
- x(reflink_p_fix, BCH_VERSION(0, 16)) \
- x(subvol_dirent, BCH_VERSION(0, 17)) \
- x(inode_v2, BCH_VERSION(0, 18)) \
- x(freespace, BCH_VERSION(0, 19)) \
- x(alloc_v4, BCH_VERSION(0, 20)) \
- x(new_data_types, BCH_VERSION(0, 21)) \
- x(backpointers, BCH_VERSION(0, 22)) \
- x(inode_v3, BCH_VERSION(0, 23)) \
- x(unwritten_extents, BCH_VERSION(0, 24)) \
- x(bucket_gens, BCH_VERSION(0, 25)) \
- x(lru_v2, BCH_VERSION(0, 26)) \
- x(fragmentation_lru, BCH_VERSION(0, 27)) \
- x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \
- x(snapshot_trees, BCH_VERSION(0, 29)) \
- x(major_minor, BCH_VERSION(1, 0)) \
- x(snapshot_skiplists, BCH_VERSION(1, 1)) \
- x(deleted_inodes, BCH_VERSION(1, 2)) \
- x(rebalance_work, BCH_VERSION(1, 3)) \
- x(member_seq, BCH_VERSION(1, 4)) \
- x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
- x(btree_subvolume_children, BCH_VERSION(1, 6)) \
- x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
- x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
- x(disk_accounting_v2, BCH_VERSION(1, 9)) \
- x(disk_accounting_v3, BCH_VERSION(1, 10)) \
- x(disk_accounting_inum, BCH_VERSION(1, 11)) \
- x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
- x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \
- x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
- x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
- x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
- x(inode_depth, BCH_VERSION(1, 17)) \
- x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
- x(autofix_errors, BCH_VERSION(1, 19)) \
- x(directory_size, BCH_VERSION(1, 20)) \
- x(cached_backpointers, BCH_VERSION(1, 21)) \
- x(stripe_backpointers, BCH_VERSION(1, 22)) \
- x(stripe_lru, BCH_VERSION(1, 23)) \
- x(casefolding, BCH_VERSION(1, 24)) \
- x(extent_flags, BCH_VERSION(1, 25)) \
- x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
- x(fast_device_removal, BCH_VERSION(1, 27)) \
- x(inode_has_case_insensitive, BCH_VERSION(1, 28))
-
-enum bcachefs_metadata_version {
- bcachefs_metadata_version_min = 9,
-#define x(t, n) bcachefs_metadata_version_##t = n,
- BCH_METADATA_VERSIONS()
-#undef x
- bcachefs_metadata_version_max
-};
-
-static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
-
-#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
-
-#define BCH_SB_SECTOR 8
-
-#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
-
-struct bch_sb_layout {
- __uuid_t magic; /* bcachefs superblock UUID */
- __u8 layout_type;
- __u8 sb_max_size_bits; /* log2 of max superblock size, in 512 byte sectors */
- __u8 nr_superblocks;
- __u8 pad[5];
- __le64 sb_offset[61];
-} __packed __aligned(8);
-
-#define BCH_SB_LAYOUT_SECTOR 7
-
-/*
- * @offset - sector where this sb was written
- * @version - on disk format version
- * @version_min - Oldest metadata version this filesystem contains; so we can
- * safely drop compatibility code and refuse to mount filesystems
- * we'd need it for
- * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
- * @seq - incremented each time superblock is written
- * @uuid - used for generating various magic numbers and identifying
- * member devices, never changes
- * @user_uuid - user visible UUID, may be changed
- * @label - filesystem label
- * @seq - identifies most recent superblock, incremented each time
- * superblock is written
- * @features - enabled incompatible features
- */
-struct bch_sb {
- struct bch_csum csum;
- __le16 version;
- __le16 version_min;
- __le16 pad[2];
- __uuid_t magic;
- __uuid_t uuid;
- __uuid_t user_uuid;
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 offset;
- __le64 seq;
-
- __le16 block_size;
- __u8 dev_idx;
- __u8 nr_devices;
- __le32 u64s;
-
- __le64 time_base_lo;
- __le32 time_base_hi;
- __le32 time_precision;
-
- __le64 flags[7];
- __le64 write_time;
- __le64 features[2];
- __le64 compat[2];
-
- struct bch_sb_layout layout;
-
- struct bch_sb_field start[0];
- __le64 _data[];
-} __packed __aligned(8);
-
-/*
- * Flags:
- * BCH_SB_INITIALIZED - set on first mount
- * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect
- * behaviour of mount/recovery path:
- * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits
- * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80
- * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
- * DATA/META_CSUM_TYPE. Also indicates encryption
- * algorithm in use, if/when we get more than one
- */
-
-LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16);
-
-LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1);
-LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2);
-LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8);
-LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12);
-
-LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28);
-
-LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33);
-LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40);
-
-LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44);
-LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48);
-
-LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
-
-LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57);
-LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58);
-LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59);
-LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60);
-
-LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
-LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
-
-LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
-LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
- struct bch_sb, flags[0], 63, 64);
-
-LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);
-LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9);
-
-LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
-LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
-
-/*
- * Max size of an extent that may require bouncing to read or write
- * (checksummed, compressed): 64k
- */
-LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
- struct bch_sb, flags[1], 14, 20);
-
-LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
-LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
-
-LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
-LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
-LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
-
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
- struct bch_sb, flags[2], 0, 4);
-LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
-
-LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
-LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
-LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
-LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
-LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
-LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
-LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64);
-LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
-LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
-LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
-LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54);
-LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56);
-
-LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
-LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
- struct bch_sb, flags[4], 60, 64);
-
-LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
- struct bch_sb, flags[5], 0, 16);
-LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
- struct bch_sb, flags[5], 16, 32);
-LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
-LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
- struct bch_sb, flags[5], 48, 64);
-LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
-LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
-LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
-LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22);
-LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23);
-LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24);
-
-static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
-{
- return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
-}
-
-static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
-{
- SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
- SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
-}
-
-static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
-{
- return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
- (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
-}
-
-static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
-{
- SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
- SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
-}
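-
-/*
- * Illustrative note: the (background) compression type no longer fits in its
- * original 4 bit field, so it is stored split across two superblock flag
- * words - the low 4 bits in *_COMPRESSION_TYPE_LO and the remaining bits in
- * *_COMPRESSION_TYPE_HI - and the helpers above reassemble it as
- * lo | (hi << 4).
- */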
-
-/*
- * Features:
- *
- * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist
- * reflink: gates KEY_TYPE_reflink
- * inline_data: gates KEY_TYPE_inline_data
- * new_siphash: gates BCH_STR_HASH_siphash
- * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
- */
-#define BCH_SB_FEATURES() \
- x(lz4, 0) \
- x(gzip, 1) \
- x(zstd, 2) \
- x(atomic_nlink, 3) \
- x(ec, 4) \
- x(journal_seq_blacklist_v3, 5) \
- x(reflink, 6) \
- x(new_siphash, 7) \
- x(inline_data, 8) \
- x(new_extent_overwrite, 9) \
- x(incompressible, 10) \
- x(btree_ptr_v2, 11) \
- x(extents_above_btree_updates, 12) \
- x(btree_updates_journalled, 13) \
- x(reflink_inline_data, 14) \
- x(new_varint, 15) \
- x(journal_no_flush, 16) \
- x(alloc_v2, 17) \
- x(extents_across_btree_nodes, 18) \
- x(incompat_version_field, 19) \
- x(casefolding, 20) \
- x(no_alloc_info, 21) \
- x(small_image, 22)
-
-#define BCH_SB_FEATURES_ALWAYS \
- (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
- BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\
- BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\
- BIT_ULL(BCH_FEATURE_alloc_v2)|\
- BIT_ULL(BCH_FEATURE_extents_across_btree_nodes))
-
-#define BCH_SB_FEATURES_ALL \
- (BCH_SB_FEATURES_ALWAYS| \
- BIT_ULL(BCH_FEATURE_new_siphash)| \
- BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
- BIT_ULL(BCH_FEATURE_new_varint)| \
- BIT_ULL(BCH_FEATURE_journal_no_flush)| \
- BIT_ULL(BCH_FEATURE_incompat_version_field))
-
-enum bch_sb_feature {
-#define x(f, n) BCH_FEATURE_##f,
- BCH_SB_FEATURES()
-#undef x
- BCH_FEATURE_NR,
-};
-
-#define BCH_SB_COMPAT() \
- x(alloc_info, 0) \
- x(alloc_metadata, 1) \
- x(extents_above_btree_updates_done, 2) \
- x(bformat_overflow_done, 3)
-
-enum bch_sb_compat {
-#define x(f, n) BCH_COMPAT_##f,
- BCH_SB_COMPAT()
-#undef x
- BCH_COMPAT_NR,
-};
-
-/* options: */
-
-#define BCH_VERSION_UPGRADE_OPTS() \
- x(compatible, 0) \
- x(incompatible, 1) \
- x(none, 2)
-
-enum bch_version_upgrade_opts {
-#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
- BCH_VERSION_UPGRADE_OPTS()
-#undef x
-};
-
-#define BCH_REPLICAS_MAX 4U
-
-#define BCH_BKEY_PTRS_MAX 16U
-
-#define BCH_ERROR_ACTIONS() \
- x(continue, 0) \
- x(fix_safe, 1) \
- x(panic, 2) \
- x(ro, 3)
-
-enum bch_error_actions {
-#define x(t, n) BCH_ON_ERROR_##t = n,
- BCH_ERROR_ACTIONS()
-#undef x
- BCH_ON_ERROR_NR
-};
-
-#define BCH_DEGRADED_ACTIONS() \
- x(ask, 0) \
- x(yes, 1) \
- x(very, 2) \
- x(no, 3)
-
-enum bch_degraded_actions {
-#define x(t, n) BCH_DEGRADED_##t = n,
- BCH_DEGRADED_ACTIONS()
-#undef x
- BCH_DEGRADED_ACTIONS_NR
-};
-
-#define BCH_STR_HASH_TYPES() \
- x(crc32c, 0) \
- x(crc64, 1) \
- x(siphash_old, 2) \
- x(siphash, 3)
-
-enum bch_str_hash_type {
-#define x(t, n) BCH_STR_HASH_##t = n,
- BCH_STR_HASH_TYPES()
-#undef x
- BCH_STR_HASH_NR
-};
-
-#define BCH_STR_HASH_OPTS() \
- x(crc32c, 0) \
- x(crc64, 1) \
- x(siphash, 2)
-
-enum bch_str_hash_opts {
-#define x(t, n) BCH_STR_HASH_OPT_##t = n,
- BCH_STR_HASH_OPTS()
-#undef x
- BCH_STR_HASH_OPT_NR
-};
-
-#define BCH_CSUM_TYPES() \
- x(none, 0) \
- x(crc32c_nonzero, 1) \
- x(crc64_nonzero, 2) \
- x(chacha20_poly1305_80, 3) \
- x(chacha20_poly1305_128, 4) \
- x(crc32c, 5) \
- x(crc64, 6) \
- x(xxhash, 7)
-
-enum bch_csum_type {
-#define x(t, n) BCH_CSUM_##t = n,
- BCH_CSUM_TYPES()
-#undef x
- BCH_CSUM_NR
-};
-
-static const __maybe_unused unsigned bch_crc_bytes[] = {
- [BCH_CSUM_none] = 0,
- [BCH_CSUM_crc32c_nonzero] = 4,
- [BCH_CSUM_crc32c] = 4,
- [BCH_CSUM_crc64_nonzero] = 8,
- [BCH_CSUM_crc64] = 8,
- [BCH_CSUM_xxhash] = 8,
- [BCH_CSUM_chacha20_poly1305_80] = 10,
- [BCH_CSUM_chacha20_poly1305_128] = 16,
-};
-
-static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
-{
- switch (type) {
- case BCH_CSUM_chacha20_poly1305_80:
- case BCH_CSUM_chacha20_poly1305_128:
- return true;
- default:
- return false;
- }
-}
-
-#define BCH_CSUM_OPTS() \
- x(none, 0) \
- x(crc32c, 1) \
- x(crc64, 2) \
- x(xxhash, 3)
-
-enum bch_csum_opt {
-#define x(t, n) BCH_CSUM_OPT_##t = n,
- BCH_CSUM_OPTS()
-#undef x
- BCH_CSUM_OPT_NR
-};
-
-#define BCH_COMPRESSION_TYPES() \
- x(none, 0) \
- x(lz4_old, 1) \
- x(gzip, 2) \
- x(lz4, 3) \
- x(zstd, 4) \
- x(incompressible, 5)
-
-enum bch_compression_type {
-#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
- BCH_COMPRESSION_TYPES()
-#undef x
- BCH_COMPRESSION_TYPE_NR
-};
-
-#define BCH_COMPRESSION_OPTS() \
- x(none, 0) \
- x(lz4, 1) \
- x(gzip, 2) \
- x(zstd, 3)
-
-enum bch_compression_opts {
-#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
- BCH_COMPRESSION_OPTS()
-#undef x
- BCH_COMPRESSION_OPT_NR
-};
-
-/*
- * Magic numbers
- *
- * The various other data structures have their own magic numbers, which are
- * xored with the first part of the cache set's UUID
- */
-
-#define BCACHE_MAGIC \
- UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \
- 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
-#define BCHFS_MAGIC \
- UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \
- 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-
-#define BCACHEFS_STATFS_MAGIC BCACHEFS_SUPER_MAGIC
-
-#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
-#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
-
-static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
-{
- __le64 ret;
-
- memcpy(&ret, &sb->uuid, sizeof(ret));
- return ret;
-}
-
-static inline __u64 __jset_magic(struct bch_sb *sb)
-{
- return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
-}
-
-static inline __u64 __bset_magic(struct bch_sb *sb)
-{
- return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
-}
-
-/* Journal */
-
-#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
-
-#define BCH_JSET_ENTRY_TYPES() \
- x(btree_keys, 0) \
- x(btree_root, 1) \
- x(prio_ptrs, 2) \
- x(blacklist, 3) \
- x(blacklist_v2, 4) \
- x(usage, 5) \
- x(data_usage, 6) \
- x(clock, 7) \
- x(dev_usage, 8) \
- x(log, 9) \
- x(overwrite, 10) \
- x(write_buffer_keys, 11) \
- x(datetime, 12) \
- x(log_bkey, 13)
-
-enum bch_jset_entry_type {
-#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
- BCH_JSET_ENTRY_TYPES()
-#undef x
- BCH_JSET_ENTRY_NR
-};
-
-static inline bool jset_entry_is_key(struct jset_entry *e)
-{
- switch (e->type) {
- case BCH_JSET_ENTRY_btree_keys:
- case BCH_JSET_ENTRY_btree_root:
- case BCH_JSET_ENTRY_write_buffer_keys:
- return true;
- }
-
- return false;
-}
-
-/*
- * Journal sequence numbers can be blacklisted: bsets record the max sequence
- * number of all the journal entries they contain updates for, so that on
- * recovery we can ignore those bsets that contain index updates newer than what
- * made it into the journal.
- *
- * This means that we can't reuse that journal_seq - we have to skip it, and
- * then record that we skipped it so that the next time we crash and recover we
- * don't think there was a missing journal entry.
- */
-struct jset_entry_blacklist {
- struct jset_entry entry;
- __le64 seq;
-};
-
-struct jset_entry_blacklist_v2 {
- struct jset_entry entry;
- __le64 start;
- __le64 end;
-};
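-
-/*
- * Illustrative example: suppose a bset on disk records journal_seq 100, but
- * journal entry 100 itself never made it to disk before a crash.  Recovery
- * ignores that bset, must not reuse sequence number 100 for future journal
- * writes, and records the skipped range as a jset_entry_blacklist_v2 entry
- * covering seq 100, so later recoveries don't mistake it for a lost journal
- * entry.
- */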
-
-#define BCH_FS_USAGE_TYPES() \
- x(reserved, 0) \
- x(inodes, 1) \
- x(key_version, 2)
-
-enum bch_fs_usage_type {
-#define x(f, nr) BCH_FS_USAGE_##f = nr,
- BCH_FS_USAGE_TYPES()
-#undef x
- BCH_FS_USAGE_NR
-};
-
-struct jset_entry_usage {
- struct jset_entry entry;
- __le64 v;
-} __packed;
-
-struct jset_entry_data_usage {
- struct jset_entry entry;
- __le64 v;
- struct bch_replicas_entry_v1 r;
-} __packed;
-
-struct jset_entry_clock {
- struct jset_entry entry;
- __u8 rw;
- __u8 pad[7];
- __le64 time;
-} __packed;
-
-struct jset_entry_dev_usage_type {
- __le64 buckets;
- __le64 sectors;
- __le64 fragmented;
-} __packed;
-
-struct jset_entry_dev_usage {
- struct jset_entry entry;
- __le32 dev;
- __u32 pad;
-
- __le64 _buckets_ec; /* No longer used */
- __le64 _buckets_unavailable; /* No longer used */
-
- struct jset_entry_dev_usage_type d[];
-};
-
-static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
-{
- return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
- sizeof(struct jset_entry_dev_usage_type);
-}
-
-struct jset_entry_log {
- struct jset_entry entry;
- u8 d[];
-} __packed __aligned(8);
-
-static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
-{
- unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);
-
- while (b && !l->d[b - 1])
- --b;
- return b;
-}
-
-struct jset_entry_datetime {
- struct jset_entry entry;
- __le64 seconds;
-} __packed __aligned(8);
-
-/*
- * On disk format for a journal entry:
- * seq is monotonically increasing; every journal entry has its own unique
- * sequence number.
- *
- * last_seq is the oldest journal entry that still has keys the btree hasn't
- * flushed to disk yet.
- *
- * version is for on disk format changes.
- */
-struct jset {
- struct bch_csum csum;
-
- __le64 magic;
- __le64 seq;
- __le32 version;
- __le32 flags;
-
- __le32 u64s; /* size of d[] in u64s */
-
- __u8 encrypted_start[0];
-
- __le16 _read_clock; /* no longer used */
- __le16 _write_clock;
-
- /* Sequence number of oldest dirty journal entry */
- __le64 last_seq;
-
-
- struct jset_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
-LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
-LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
-
-#define BCH_JOURNAL_BUCKETS_MIN 8
-
-/* Btree: */
-
-enum btree_id_flags {
- BTREE_IS_extents = BIT(0),
- BTREE_IS_snapshots = BIT(1),
- BTREE_IS_snapshot_field = BIT(2),
- BTREE_IS_data = BIT(3),
- BTREE_IS_write_buffer = BIT(4),
-};
-
-#define BCH_BTREE_IDS() \
- x(extents, 0, \
- BTREE_IS_extents| \
- BTREE_IS_snapshots| \
- BTREE_IS_data, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_error)| \
- BIT_ULL(KEY_TYPE_cookie)| \
- BIT_ULL(KEY_TYPE_extent)| \
- BIT_ULL(KEY_TYPE_reservation)| \
- BIT_ULL(KEY_TYPE_reflink_p)| \
- BIT_ULL(KEY_TYPE_inline_data)) \
- x(inodes, 1, \
- BTREE_IS_snapshots, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_inode)| \
- BIT_ULL(KEY_TYPE_inode_v2)| \
- BIT_ULL(KEY_TYPE_inode_v3)| \
- BIT_ULL(KEY_TYPE_inode_generation)) \
- x(dirents, 2, \
- BTREE_IS_snapshots, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_hash_whiteout)| \
- BIT_ULL(KEY_TYPE_dirent)) \
- x(xattrs, 3, \
- BTREE_IS_snapshots, \
- BIT_ULL(KEY_TYPE_whiteout)| \
- BIT_ULL(KEY_TYPE_cookie)| \
- BIT_ULL(KEY_TYPE_hash_whiteout)| \
- BIT_ULL(KEY_TYPE_xattr)) \
- x(alloc, 4, 0, \
- BIT_ULL(KEY_TYPE_alloc)| \
- BIT_ULL(KEY_TYPE_alloc_v2)| \
- BIT_ULL(KEY_TYPE_alloc_v3)| \
- BIT_ULL(KEY_TYPE_alloc_v4)) \
- x(quotas, 5, 0, \
- BIT_ULL(KEY_TYPE_quota)) \
- x(stripes, 6, 0, \
- BIT_ULL(KEY_TYPE_stripe)) \
- x(reflink, 7, \
- BTREE_IS_extents| \
- BTREE_IS_data, \
- BIT_ULL(KEY_TYPE_reflink_v)| \
- BIT_ULL(KEY_TYPE_indirect_inline_data)| \
- BIT_ULL(KEY_TYPE_error)) \
- x(subvolumes, 8, 0, \
- BIT_ULL(KEY_TYPE_subvolume)) \
- x(snapshots, 9, 0, \
- BIT_ULL(KEY_TYPE_snapshot)) \
- x(lru, 10, \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_set)) \
- x(freespace, 11, \
- BTREE_IS_extents, \
- BIT_ULL(KEY_TYPE_set)) \
- x(need_discard, 12, 0, \
- BIT_ULL(KEY_TYPE_set)) \
- x(backpointers, 13, \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_backpointer)) \
- x(bucket_gens, 14, 0, \
- BIT_ULL(KEY_TYPE_bucket_gens)) \
- x(snapshot_trees, 15, 0, \
- BIT_ULL(KEY_TYPE_snapshot_tree)) \
- x(deleted_inodes, 16, \
- BTREE_IS_snapshot_field| \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_set)) \
- x(logged_ops, 17, 0, \
- BIT_ULL(KEY_TYPE_logged_op_truncate)| \
- BIT_ULL(KEY_TYPE_logged_op_finsert)| \
- BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \
- x(rebalance_work, 18, \
- BTREE_IS_snapshot_field| \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
- x(subvolume_children, 19, 0, \
- BIT_ULL(KEY_TYPE_set)) \
- x(accounting, 20, \
- BTREE_IS_snapshot_field| \
- BTREE_IS_write_buffer, \
- BIT_ULL(KEY_TYPE_accounting)) \
-
-enum btree_id {
-#define x(name, nr, ...) BTREE_ID_##name = nr,
- BCH_BTREE_IDS()
-#undef x
- BTREE_ID_NR
-};
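
The BCH_BTREE_IDS() x-macro is expanded more than once; besides the enum above, the same list can generate lookup tables keyed by btree id. An illustrative (hypothetical) expansion producing a name table:

static const char * const example_btree_id_names[] = {
#define x(name, nr, ...)	[nr] = #name,
	BCH_BTREE_IDS()
#undef x
	NULL
};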
-
-/*
- * Maximum number of btrees that we will _ever_ have under the current scheme,
- * where we refer to them with 64 bit bitfields - and we also need a bit for
- * the interior btree node type:
- */
-#define BTREE_ID_NR_MAX 63
-
-static inline bool btree_id_is_alloc(enum btree_id id)
-{
- switch (id) {
- case BTREE_ID_alloc:
- case BTREE_ID_backpointers:
- case BTREE_ID_need_discard:
- case BTREE_ID_freespace:
- case BTREE_ID_bucket_gens:
- case BTREE_ID_lru:
- case BTREE_ID_accounting:
- return true;
- default:
- return false;
- }
-}
-
-#define BTREE_MAX_DEPTH 4U
-
-/* Btree nodes */
-
-/*
- * Btree nodes
- *
- * On disk a btree node is a list/log of these; within each set the keys are
- * sorted
- */
-struct bset {
- __le64 seq;
-
- /*
- * Highest journal entry this bset contains keys for.
- * If on recovery we don't see that journal entry, this bset is ignored:
- * this allows us to preserve the order of all index updates after a
- * crash, since the journal records a total order of all index updates
- * and anything that didn't make it to the journal doesn't get used.
- */
- __le64 journal_seq;
-
- __le32 flags;
- __le16 version;
- __le16 u64s; /* count of d[] in u64s */
-
- struct bkey_packed start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
-
-LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5);
-LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
- struct bset, flags, 5, 6);
-
-/* Sector offset within the btree node: */
-LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32);
-
-struct btree_node {
- struct bch_csum csum;
- __le64 magic;
-
- /* this flags field is encrypted, unlike bset->flags: */
- __le64 flags;
-
- /* Closed interval: */
- struct bpos min_key;
- struct bpos max_key;
- struct bch_extent_ptr _ptr; /* not used anymore */
- struct bkey_format format;
-
- union {
- struct bset keys;
- struct {
- __u8 pad[22];
- __le16 u64s;
- __u64 _data[0];
-
- };
- };
-} __packed __aligned(8);
-
-LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4);
-LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
-LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
- struct btree_node, flags, 8, 9);
-LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25);
-/* 25-32 unused */
-LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
-
-static inline __u64 BTREE_NODE_ID(struct btree_node *n)
-{
- return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
-}
-
-static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
-{
- SET_BTREE_NODE_ID_LO(n, v);
- SET_BTREE_NODE_ID_HI(n, v >> 4);
-}
-
-struct btree_node_entry {
- struct bch_csum csum;
-
- union {
- struct bset keys;
- struct {
- __u8 pad[22];
- __le16 u64s;
- __u64 _data[0];
- };
- };
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
deleted file mode 100644
index 52594e925eb7..000000000000
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ /dev/null
@@ -1,473 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IOCTL_H
-#define _BCACHEFS_IOCTL_H
-
-#include <linux/uuid.h>
-#include <asm/ioctl.h>
-#include "bcachefs_format.h"
-#include "bkey_types.h"
-
-/*
- * Flags common to multiple ioctls:
- */
-#define BCH_FORCE_IF_DATA_LOST (1 << 0)
-#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
-#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
-#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
-
-#define BCH_FORCE_IF_LOST \
- (BCH_FORCE_IF_DATA_LOST| \
- BCH_FORCE_IF_METADATA_LOST)
-#define BCH_FORCE_IF_DEGRADED \
- (BCH_FORCE_IF_DATA_DEGRADED| \
- BCH_FORCE_IF_METADATA_DEGRADED)
-
-/*
- * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
- * (e.g. /dev/sda1); if set, the dev field is the device's index within the
- * filesystem:
- */
-#define BCH_BY_INDEX (1 << 4)
-
-/*
- * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
- * wide superblock:
- */
-#define BCH_READ_DEV (1 << 5)
-
-/* global control dev: */
-
-/* These are currently broken, and probably unnecessary: */
-#if 0
-#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
-#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
-
-struct bch_ioctl_assemble {
- __u32 flags;
- __u32 nr_devs;
- __u64 pad;
- __u64 devs[];
-};
-
-struct bch_ioctl_incremental {
- __u32 flags;
- __u64 pad;
- __u64 dev;
-};
-#endif
-
-/* filesystem ioctls: */
-
-#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
-
-/* These only make sense when we also have incremental assembly */
-#if 0
-#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
-#define BCH_IOCTL_STOP _IO(0xbc, 3)
-#endif
-
-#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
-#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
-#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
-#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
-#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
-#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
-#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
-
-#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
-#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
-
-#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
-
-#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
-#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
-#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
-#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
-
-/* ioctls below act on a particular file, not the filesystem as a whole: */
-
-#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
-
-/*
- * BCH_IOCTL_QUERY_UUID: get filesystem UUID
- *
- * Returns user visible UUID, not internal UUID (which may not ever be changed);
- * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
- * this UUID.
- */
-struct bch_ioctl_query_uuid {
- __uuid_t uuid;
-};
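
A hedged userspace sketch (not part of this patch) of how this ioctl is typically issued; @path is assumed to be any file or directory inside the mounted filesystem, so the call reaches the filesystem's ioctl handler:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int example_query_fs_uuid(const char *path)
{
	struct bch_ioctl_query_uuid u;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;

	if (ioctl(fd, BCH_IOCTL_QUERY_UUID, &u)) {
		close(fd);
		return -1;
	}
	close(fd);

	/* u.uuid is the user-visible UUID; /sys/fs/bcachefs/<uuid> exists */
	return 0;
}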
-
-#if 0
-struct bch_ioctl_start {
- __u32 flags;
- __u32 pad;
-};
-#endif
-
-/*
- * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
- *
- * The specified device must not be open or in use. On success, the new device
- * will be an online member of the filesystem just like any other member.
- *
- * The device must first be prepared by userspace by formatting with a bcachefs
- * superblock, which is only used for passing in superblock options/parameters
- * for that device (in struct bch_member). The new device's superblock should
- * not claim to be a member of any existing filesystem - UUIDs on it will be
- * ignored.
- */
-
-/*
- * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
- *
- * Any data present on @dev will be permanently deleted, and @dev will be
- * removed from its slot in the filesystem's list of member devices. The device
- * may be either online or offline.
- *
- * Will fail if removing @dev would leave us with insufficient read write
- * devices or degraded/unavailable data, unless the appropriate
- * BCH_FORCE_IF_* flags are set.
- */
-
-/*
- * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
- * but is not open (e.g. because we started in degraded mode), bring it online
- *
- * all existing data on @dev will be available once the device is online,
- * exactly as if @dev was present when the filesystem was first mounted
- */
-
-/*
- * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
- * block device, without removing it from the filesystem (so it can be brought
- * back online later)
- *
- * Data present on @dev will be unavailable while @dev is offline (unless
- * replicated), but will still be intact and untouched if @dev is brought back
- * online
- *
- * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
- * leave us with insufficient read write devices or degraded/unavailable data,
- * unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-
-struct bch_ioctl_disk {
- __u32 flags;
- __u32 pad;
- __u64 dev;
-};
-
-/*
- * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
- *
- * @new_state - one of the bch_member_state states (rw, ro, failed,
- * spare)
- *
- * Will refuse to change member state if we would then have insufficient devices
- * to write to, or if it would result in degraded data (when @new_state is
- * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
- */
-struct bch_ioctl_disk_set_state {
- __u32 flags;
- __u8 new_state;
- __u8 pad[3];
- __u64 dev;
-};
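
For illustration only (not part of this patch): setting member device index 1 read-only by index, with no force flags. BCH_MEMBER_STATE_ro is assumed to be the member-state constant from bcachefs_format.h, and @fs_fd an fd that reaches the filesystem's ioctl handler.

#include <sys/ioctl.h>

static int example_set_device_ro(int fs_fd)
{
	struct bch_ioctl_disk_set_state s = {
		.flags		= BCH_BY_INDEX,	/* .dev is an index, not a path */
		.new_state	= BCH_MEMBER_STATE_ro,
		.dev		= 1,
	};

	return ioctl(fs_fd, BCH_IOCTL_DISK_SET_STATE, &s);
}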
-
-#define BCH_DATA_OPS() \
- x(scrub, 0) \
- x(rereplicate, 1) \
- x(migrate, 2) \
- x(rewrite_old_nodes, 3) \
- x(drop_extra_replicas, 4)
-
-enum bch_data_ops {
-#define x(t, n) BCH_DATA_OP_##t = n,
- BCH_DATA_OPS()
-#undef x
- BCH_DATA_OP_NR
-};
-
-/*
- * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
- * scrub, rereplicate, migrate).
- *
- * This ioctl kicks off a job in the background, and returns a file descriptor.
- * Reading from the file descriptor returns a struct bch_ioctl_data_event,
- * indicating current progress, and closing the file descriptor will stop the
- * job. The file descriptor is O_CLOEXEC.
- */
-struct bch_ioctl_data {
- __u16 op;
- __u8 start_btree;
- __u8 end_btree;
- __u32 flags;
-
- struct bpos start_pos;
- struct bpos end_pos;
-
- union {
- struct {
- __u32 dev;
- __u32 data_types;
- } scrub;
- struct {
- __u32 dev;
- __u32 pad;
- } migrate;
- struct {
- __u64 pad[8];
- };
- };
-} __packed __aligned(8);
-
-enum bch_data_event {
- BCH_DATA_EVENT_PROGRESS = 0,
- /* XXX: add an event for reporting errors */
- BCH_DATA_EVENT_NR = 1,
-};
-
-enum data_progress_data_type_special {
- DATA_PROGRESS_DATA_TYPE_phys = 254,
- DATA_PROGRESS_DATA_TYPE_done = 255,
-};
-
-struct bch_ioctl_data_progress {
- __u8 data_type;
- __u8 btree_id;
- __u8 pad[2];
- struct bpos pos;
-
- __u64 sectors_done;
- __u64 sectors_total;
- __u64 sectors_error_corrected;
- __u64 sectors_error_uncorrected;
-} __packed __aligned(8);
-
-enum bch_ioctl_data_event_ret {
- BCH_IOCTL_DATA_EVENT_RET_done = 1,
- BCH_IOCTL_DATA_EVENT_RET_device_offline = 2,
-};
-
-struct bch_ioctl_data_event {
- __u8 type;
- __u8 ret;
- __u8 pad[6];
- union {
- struct bch_ioctl_data_progress p;
- __u64 pad2[15];
- };
-} __packed __aligned(8);
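
A sketch of the intended usage (illustrative, not part of this patch): kick off a rereplicate job over every btree and poll progress events from the returned descriptor. POS_MIN, POS_MAX and BTREE_ID_NR are assumed to come from bcachefs_format.h; error handling is elided.

#include <unistd.h>
#include <sys/ioctl.h>

static void example_rereplicate(int fs_fd)
{
	struct bch_ioctl_data cmd = {
		.op		= BCH_DATA_OP_rereplicate,
		.start_btree	= 0,
		.end_btree	= BTREE_ID_NR,
		.start_pos	= POS_MIN,
		.end_pos	= POS_MAX,
	};
	struct bch_ioctl_data_event e;
	int job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &cmd);

	if (job_fd < 0)
		return;

	/* each read returns a progress event until the job reports done */
	while (read(job_fd, &e, sizeof(e)) == sizeof(e) &&
	       e.type == BCH_DATA_EVENT_PROGRESS &&
	       e.p.data_type != DATA_PROGRESS_DATA_TYPE_done)
		;	/* e.p.sectors_done / e.p.sectors_total give progress */

	close(job_fd);	/* closing the fd also stops a still-running job */
}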
-
-struct bch_replicas_usage {
- __u64 sectors;
- struct bch_replicas_entry_v1 r;
-} __packed;
-
-static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
-{
- return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
-}
-
-static inline struct bch_replicas_usage *
-replicas_usage_next(struct bch_replicas_usage *u)
-{
- return (void *) u + replicas_usage_bytes(u);
-}
-
-/* Obsolete */
-/*
- * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
- *
- * On success, @replica_entries_bytes will be changed to indicate the number of
- * bytes actually used.
- *
- * Returns -ERANGE if @replica_entries_bytes was too small
- */
-struct bch_ioctl_fs_usage {
- __u64 capacity;
- __u64 used;
- __u64 online_reserved;
- __u64 persistent_reserved[BCH_REPLICAS_MAX];
-
- __u32 replica_entries_bytes;
- __u32 pad;
-
- struct bch_replicas_usage replicas[];
-};
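
The replicas[] array is a packed sequence of variable-length entries, so it has to be walked with replicas_usage_next() rather than indexed. A minimal sketch (illustrative, not part of this patch), assuming @u was filled in by a successful BCH_IOCTL_FS_USAGE call:

static void example_walk_replicas(struct bch_ioctl_fs_usage *u)
{
	struct bch_replicas_usage *r = u->replicas;
	struct bch_replicas_usage *end =
		(void *) u->replicas + u->replica_entries_bytes;

	for (; r < end; r = replicas_usage_next(r)) {
		/* r->sectors is the usage accounted to the entry r->r */
	}
}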
-
-/* Obsolete */
-/*
- * BCH_IOCTL_DEV_USAGE: query device disk space usage
- *
- * Returns disk space usage broken out by data type - both by buckets and
- * sectors.
- */
-struct bch_ioctl_dev_usage {
- __u64 dev;
- __u32 flags;
- __u8 state;
- __u8 pad[7];
-
- __u32 bucket_size;
- __u64 nr_buckets;
-
- __u64 buckets_ec;
-
- struct bch_ioctl_dev_usage_type {
- __u64 buckets;
- __u64 sectors;
- __u64 fragmented;
- } d[10];
-};
-
-/* Obsolete */
-struct bch_ioctl_dev_usage_v2 {
- __u64 dev;
- __u32 flags;
- __u8 state;
- __u8 nr_data_types;
- __u8 pad[6];
-
- __u32 bucket_size;
- __u64 nr_buckets;
-
- struct bch_ioctl_dev_usage_type d[];
-};
-
-/*
- * BCH_IOCTL_READ_SUPER: read filesystem superblock
- *
- * Equivalent to reading the superblock directly from the block device, except it
- * avoids racing with the kernel writing the superblock or having to figure out
- * which block device to read
- *
- * @sb - buffer to read into
- * @size - size of userspace allocated buffer
- * @dev - device to read superblock for, if BCH_READ_DEV flag is
- * specified
- *
- * Returns -ERANGE if buffer provided is too small
- */
-struct bch_ioctl_read_super {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 size;
- __u64 sb;
-};
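
Because the superblock size isn't known up front, callers typically retry with a larger buffer when the ioctl reports -ERANGE. An illustrative userspace sketch (not part of this patch); @fs_fd is assumed to reach the filesystem's ioctl handler:

#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static void *example_read_super(int fs_fd, __u64 *size)
{
	__u64 len = 4096;

	for (;;) {
		void *buf = malloc(len);

		if (!buf)
			return NULL;

		struct bch_ioctl_read_super i = {
			.size	= len,
			.sb	= (__u64) (unsigned long) buf,
		};

		if (!ioctl(fs_fd, BCH_IOCTL_READ_SUPER, &i)) {
			*size = len;
			return buf;
		}

		free(buf);
		if (errno != ERANGE)
			return NULL;
		len *= 2;	/* buffer too small: grow and retry */
	}
}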
-
-/*
- * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
- * determine if disk is an (online) member - if so, returns device's index
- *
- * Returns -ENOENT if not found
- */
-struct bch_ioctl_disk_get_idx {
- __u64 dev;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
- *
- * @dev - member to resize
- * @nbuckets - new number of buckets
- */
-struct bch_ioctl_disk_resize {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 nbuckets;
-};
-
-/*
- * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
- *
- * @dev - member to resize
- * @nbuckets - new number of buckets
- */
-struct bch_ioctl_disk_resize_journal {
- __u32 flags;
- __u32 pad;
- __u64 dev;
- __u64 nbuckets;
-};
-
-struct bch_ioctl_subvolume {
- __u32 flags;
- __u32 dirfd;
- __u16 mode;
- __u16 pad[3];
- __u64 dst_ptr;
- __u64 src_ptr;
-};
-
-#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
-#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
-
-/*
- * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_offline {
- __u64 flags;
- __u64 opts; /* string */
- __u64 nr_devs;
- __u64 devs[] __counted_by(nr_devs);
-};
-
-/*
- * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
- * but with the kernel's implementation of fsck:
- */
-struct bch_ioctl_fsck_online {
- __u64 flags;
- __u64 opts; /* string */
-};
-
-/*
- * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
- *
- * @accounting_u64s - size, in u64s, allocated for accounting entries
- *
- * On success, @accounting_u64s will be changed to indicate the number of
- * u64s actually used.
- *
- * Returns -ERANGE if @accounting_u64s was too small
- */
-struct bch_ioctl_query_accounting {
- __u64 capacity;
- __u64 used;
- __u64 online_reserved;
-
- __u32 accounting_u64s; /* input parameter */
- __u32 accounting_types_mask; /* input parameter */
-
- struct bkey_i_accounting accounting[];
-};
-
-#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0)
-
-struct bch_ioctl_query_counters {
- __u16 nr;
- __u16 flags;
- __u32 pad;
- __u64 d[];
-};
-
-#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
deleted file mode 100644
index ee823c640642..000000000000
--- a/fs/bcachefs/bkey.c
+++ /dev/null
@@ -1,1112 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_cmp.h"
-#include "bkey_methods.h"
-#include "bset.h"
-#include "util.h"
-
-const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
-
-void bch2_bkey_packed_to_binary_text(struct printbuf *out,
- const struct bkey_format *f,
- const struct bkey_packed *k)
-{
- const u64 *p = high_word(f, k);
- unsigned word_bits = 64 - high_bit_offset;
- unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
- u64 v = *p & (~0ULL >> high_bit_offset);
-
- if (!nr_key_bits) {
- prt_str(out, "(empty)");
- return;
- }
-
- while (1) {
- unsigned next_key_bits = nr_key_bits;
-
- if (nr_key_bits < 64) {
- v >>= 64 - nr_key_bits;
- next_key_bits = 0;
- } else {
- next_key_bits -= 64;
- }
-
- bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
-
- if (!next_key_bits)
- break;
-
- prt_char(out, ' ');
-
- p = next_word(p);
- v = *p;
- word_bits = 64;
- nr_key_bits = next_key_bits;
- }
-}
-
-static void __bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
-{
- struct bkey tmp;
-
- BUG_ON(bkeyp_val_u64s(format, packed) !=
- bkey_val_u64s(unpacked));
-
- BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
-
- tmp = __bch2_bkey_unpack_key(format, packed);
-
- if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
- format->key_u64s,
- format->bits_per_field[0],
- format->bits_per_field[1],
- format->bits_per_field[2],
- format->bits_per_field[3],
- format->bits_per_field[4]);
-
- prt_printf(&buf, "compiled unpack: ");
- bch2_bkey_to_text(&buf, unpacked);
- prt_newline(&buf);
-
- prt_printf(&buf, "c unpack: ");
- bch2_bkey_to_text(&buf, &tmp);
- prt_newline(&buf);
-
- prt_printf(&buf, "compiled unpack: ");
- bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
- (struct bkey_packed *) unpacked);
- prt_newline(&buf);
-
- prt_printf(&buf, "c unpack: ");
- bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
- (struct bkey_packed *) &tmp);
- prt_newline(&buf);
-
- panic("%s", buf.buf);
- }
-}
-
-static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
-{
- if (static_branch_unlikely(&bch2_debug_check_bkey_unpack))
- __bch2_bkey_pack_verify(packed, unpacked, format);
-}
-
-struct pack_state {
- const struct bkey_format *format;
- unsigned bits; /* bits remaining in current word */
- u64 w; /* current word */
- u64 *p; /* pointer to next word */
-};
-
-__always_inline
-static struct pack_state pack_state_init(const struct bkey_format *format,
- struct bkey_packed *k)
-{
- u64 *p = high_word(format, k);
-
- return (struct pack_state) {
- .format = format,
- .bits = 64 - high_bit_offset,
- .w = 0,
- .p = p,
- };
-}
-
-__always_inline
-static void pack_state_finish(struct pack_state *state,
- struct bkey_packed *k)
-{
- EBUG_ON(state->p < k->_data);
- EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
-
- *state->p = state->w;
-}
-
-struct unpack_state {
- const struct bkey_format *format;
- unsigned bits; /* bits remaining in current word */
- u64 w; /* current word */
- const u64 *p; /* pointer to next word */
-};
-
-__always_inline
-static struct unpack_state unpack_state_init(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- const u64 *p = high_word(format, k);
-
- return (struct unpack_state) {
- .format = format,
- .bits = 64 - high_bit_offset,
- .w = *p << high_bit_offset,
- .p = p,
- };
-}
-
-__always_inline
-static u64 get_inc_field(struct unpack_state *state, unsigned field)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
-
- if (bits >= state->bits) {
- v = state->w >> (64 - bits);
- bits -= state->bits;
-
- state->p = next_word(state->p);
- state->w = *state->p;
- state->bits = 64;
- }
-
- /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
- v |= (state->w >> 1) >> (63 - bits);
- state->w <<= bits;
- state->bits -= bits;
-
- return v + offset;
-}
-
-__always_inline
-static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
-
- if (bits) {
- if (bits > state->bits) {
- bits -= state->bits;
- /* avoid shift by 64 if bits is 64 - bits is never 0 here: */
- state->w |= (v >> 1) >> (bits - 1);
-
- *state->p = state->w;
- state->p = next_word(state->p);
- state->w = 0;
- state->bits = 64;
- }
-
- state->bits -= bits;
- state->w |= v << state->bits;
- }
-}
-
-__always_inline
-static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 offset = le64_to_cpu(state->format->field_offset[field]);
-
- if (v < offset)
- return false;
-
- v -= offset;
-
- if (fls64(v) > bits)
- return false;
-
- __set_inc_field(state, field, v);
- return true;
-}
-
-/*
- * Note: does NOT set out->format (we don't know what it should be here!)
- *
- * Also: doesn't work on extents - it doesn't preserve the invariant that
- * if k is packed bkey_start_pos(k) will successfully pack
- */
-static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
- struct bkey_packed *out,
- const struct bkey_format *in_f,
- const struct bkey_packed *in)
-{
- struct pack_state out_s = pack_state_init(out_f, out);
- struct unpack_state in_s = unpack_state_init(in_f, in);
- u64 *w = out->_data;
- unsigned i;
-
- *w = 0;
-
- for (i = 0; i < BKEY_NR_FIELDS; i++)
- if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
- return false;
-
- /* Can't happen because the val would be too big to unpack: */
- EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
-
- pack_state_finish(&out_s, out);
- out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s;
- out->needs_whiteout = in->needs_whiteout;
- out->type = in->type;
-
- return true;
-}
-
-bool bch2_bkey_transform(const struct bkey_format *out_f,
- struct bkey_packed *out,
- const struct bkey_format *in_f,
- const struct bkey_packed *in)
-{
- if (!bch2_bkey_transform_key(out_f, out, in_f, in))
- return false;
-
- memcpy_u64s((u64 *) out + out_f->key_u64s,
- (u64 *) in + in_f->key_u64s,
- (in->u64s - in_f->key_u64s));
- return true;
-}
-
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
- const struct bkey_packed *in)
-{
- struct unpack_state state = unpack_state_init(format, in);
- struct bkey out;
-
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->u64s < format->key_u64s);
- EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
- EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
-
- out.u64s = BKEY_U64s + in->u64s - format->key_u64s;
- out.format = KEY_FORMAT_CURRENT;
- out.needs_whiteout = in->needs_whiteout;
- out.type = in->type;
- out.pad[0] = 0;
-
-#define x(id, field) out.field = get_inc_field(&state, id);
- bkey_fields()
-#undef x
-
- return out;
-}
-
-#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *format,
- const struct bkey_packed *in)
-{
- struct unpack_state state = unpack_state_init(format, in);
- struct bpos out;
-
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->u64s < format->key_u64s);
- EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-
- out.inode = get_inc_field(&state, BKEY_FIELD_INODE);
- out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
- out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
-
- return out;
-}
-#endif
-
-/**
- * bch2_bkey_pack_key -- pack just the key, not the value
- * @out: packed result
- * @in: key to pack
- * @format: format of packed result
- *
- * Returns: true on success, false on failure
- */
-bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
- const struct bkey_format *format)
-{
- struct pack_state state = pack_state_init(format, out);
- u64 *w = out->_data;
-
- EBUG_ON((void *) in == (void *) out);
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->format != KEY_FORMAT_CURRENT);
-
- *w = 0;
-
-#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
- bkey_fields()
-#undef x
- pack_state_finish(&state, out);
- out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
- out->format = KEY_FORMAT_LOCAL_BTREE;
- out->needs_whiteout = in->needs_whiteout;
- out->type = in->type;
-
- bch2_bkey_pack_verify(out, in, format);
- return true;
-}
-
-/**
- * bch2_bkey_unpack -- unpack the key and the value
- * @b: btree node of @src key (for packed format)
- * @dst: unpacked result
- * @src: packed input
- */
-void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
- const struct bkey_packed *src)
-{
- __bkey_unpack_key(b, &dst->k, src);
-
- memcpy_u64s(&dst->v,
- bkeyp_val(&b->format, src),
- bkeyp_val_u64s(&b->format, src));
-}
-
-/**
- * bch2_bkey_pack -- pack the key and the value
- * @dst: packed result
- * @src: unpacked input
- * @format: format of packed result
- *
- * Returns: true on success, false on failure
- */
-bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
- const struct bkey_format *format)
-{
- struct bkey_packed tmp;
-
- if (!bch2_bkey_pack_key(&tmp, &src->k, format))
- return false;
-
- memmove_u64s((u64 *) dst + format->key_u64s,
- &src->v,
- bkey_val_u64s(&src->k));
- memcpy_u64s_small(dst, &tmp, format->key_u64s);
-
- return true;
-}
-
-__always_inline
-static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 offset = le64_to_cpu(state->format->field_offset[field]);
- bool ret = true;
-
- EBUG_ON(v < offset);
- v -= offset;
-
- if (fls64(v) > bits) {
- v = ~(~0ULL << bits);
- ret = false;
- }
-
- __set_inc_field(state, field, v);
- return ret;
-}
-
-static bool bkey_packed_successor(struct bkey_packed *out,
- const struct btree *b,
- struct bkey_packed k)
-{
- const struct bkey_format *f = &b->format;
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned first_bit, offset;
- u64 *p;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- if (!nr_key_bits)
- return false;
-
- *out = k;
-
- first_bit = high_bit_offset + nr_key_bits - 1;
- p = nth_word(high_word(f, out), first_bit >> 6);
- offset = 63 - (first_bit & 63);
-
- while (nr_key_bits) {
- unsigned bits = min(64 - offset, nr_key_bits);
- u64 mask = (~0ULL >> (64 - bits)) << offset;
-
- if ((*p & mask) != mask) {
- *p += 1ULL << offset;
- EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
- return true;
- }
-
- *p &= ~mask;
- p = prev_word(p);
- nr_key_bits -= bits;
- offset = 0;
- }
-
- return false;
-}
-
-static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
-{
- for (unsigned i = 0; i < f->nr_fields; i++) {
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 packed_max = f->bits_per_field[i]
- ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
- : 0;
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (packed_max + field_offset < packed_max ||
- packed_max + field_offset > unpacked_max)
- return true;
- }
-
- return false;
-}
-
-/*
- * Returns a packed key that compares <= in
- *
- * This is used in bset_search_tree(), where we need a packed pos in order to be
- * able to compare against the keys in the auxiliary search tree - and it's
- * legal to use a packed pos that isn't equivalent to the original pos,
- * _provided_ it compares <= to the original pos.
- */
-enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
- struct bpos in,
- const struct btree *b)
-{
- const struct bkey_format *f = &b->format;
- struct pack_state state = pack_state_init(f, out);
- u64 *w = out->_data;
- struct bpos orig = in;
- bool exact = true;
- unsigned i;
-
- /*
- * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3
- * byte header, but pack_pos() won't if the len/version fields are big
- * enough - we need to make sure to zero them out:
- */
- for (i = 0; i < f->key_u64s; i++)
- w[i] = 0;
-
- if (unlikely(in.snapshot <
- le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
- if (!in.offset-- &&
- !in.inode--)
- return BKEY_PACK_POS_FAIL;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(in.offset <
- le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
- if (!in.inode--)
- return BKEY_PACK_POS_FAIL;
- in.offset = KEY_OFFSET_MAX;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(in.inode <
- le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
- return BKEY_PACK_POS_FAIL;
-
- if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) {
- in.offset = KEY_OFFSET_MAX;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) {
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)))
- exact = false;
-
- pack_state_finish(&state, out);
- out->u64s = f->key_u64s;
- out->format = KEY_FORMAT_LOCAL_BTREE;
- out->type = KEY_TYPE_deleted;
-
- if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
- if (exact) {
- BUG_ON(bkey_cmp_left_packed(b, out, &orig));
- } else {
- struct bkey_packed successor;
-
- BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
- BUG_ON(bkey_packed_successor(&successor, b, *out) &&
- bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
- !bkey_format_has_too_big_fields(f));
- }
- }
-
- return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
-}
-
-void bch2_bkey_format_init(struct bkey_format_state *s)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
- s->field_min[i] = U64_MAX;
-
- for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
- s->field_max[i] = 0;
-
- /* Make sure we can store a size of 0: */
- s->field_min[BKEY_FIELD_SIZE] = 0;
-}
-
-void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
-{
- unsigned field = 0;
-
- __bkey_format_add(s, field++, p.inode);
- __bkey_format_add(s, field++, p.offset);
- __bkey_format_add(s, field++, p.snapshot);
-}
-
-/*
- * We don't want it to be possible for the packed format to represent fields
- * bigger than a u64... that will cause confusion and issues (like with
- * bkey_packed_successor())
- */
-static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
- unsigned bits, u64 offset)
-{
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-
- bits = min(bits, unpacked_bits);
-
- offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
-
- f->bits_per_field[i] = bits;
- f->field_offset[i] = cpu_to_le64(offset);
-}
-
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
-{
- unsigned i, bits = KEY_PACKED_BITS_START;
- struct bkey_format ret = {
- .nr_fields = BKEY_NR_FIELDS,
- };
-
- for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
- s->field_min[i] = min(s->field_min[i], s->field_max[i]);
-
- set_format_field(&ret, i,
- fls64(s->field_max[i] - s->field_min[i]),
- s->field_min[i]);
-
- bits += ret.bits_per_field[i];
- }
-
- /* allow for extent merging: */
- if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
- unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
-
- ret.bits_per_field[BKEY_FIELD_SIZE] += b;
- bits += b;
- }
-
- ret.key_u64s = DIV_ROUND_UP(bits, 64);
-
- /* if we have enough spare bits, round fields up to nearest byte */
- bits = ret.key_u64s * 64 - bits;
-
- for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
- unsigned r = round_up(ret.bits_per_field[i], 8) -
- ret.bits_per_field[i];
-
- if (r <= bits) {
- set_format_field(&ret, i,
- ret.bits_per_field[i] + r,
- le64_to_cpu(ret.field_offset[i]));
- bits -= r;
- }
- }
-
- if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
- struct printbuf buf = PRINTBUF;
-
- BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
- printbuf_exit(&buf);
- }
-
- return ret;
-}
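
Putting the pieces together, the usual flow for computing a packed format for a btree node's keys is: initialize the state, feed in every key plus the node's min/max positions, then finalize. A condensed sketch (illustrative, not taken from this patch):

static struct bkey_format example_build_format(const struct bkey *keys,
					       unsigned nr_keys,
					       struct bpos min_key,
					       struct bpos max_key)
{
	struct bkey_format_state s;

	bch2_bkey_format_init(&s);

	/* make sure the node's whole key range is representable: */
	bch2_bkey_format_add_pos(&s, min_key);
	bch2_bkey_format_add_pos(&s, max_key);

	for (unsigned i = 0; i < nr_keys; i++)
		bch2_bkey_format_add_key(&s, &keys[i]);

	return bch2_bkey_format_done(&s);
}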
-
-int bch2_bkey_format_invalid(struct bch_fs *c,
- struct bkey_format *f,
- enum bch_validate_flags flags,
- struct printbuf *err)
-{
- unsigned bits = KEY_PACKED_BITS_START;
-
- if (f->nr_fields != BKEY_NR_FIELDS) {
- prt_printf(err, "incorrect number of fields: got %u, should be %u",
- f->nr_fields, BKEY_NR_FIELDS);
- return -BCH_ERR_invalid;
- }
-
- /*
- * Verify that the packed format can't represent fields larger than the
- * unpacked format:
- */
- for (unsigned i = 0; i < f->nr_fields; i++) {
- if (bch2_bkey_format_field_overflows(f, i)) {
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
- unsigned packed_bits = min(64, f->bits_per_field[i]);
- u64 packed_max = packed_bits
- ? ~((~0ULL << 1) << (packed_bits - 1))
- : 0;
-
- prt_printf(err, "field %u too large: %llu + %llu > %llu",
- i, packed_max, le64_to_cpu(f->field_offset[i]), unpacked_max);
- return -BCH_ERR_invalid;
- }
-
- bits += f->bits_per_field[i];
- }
-
- if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
- prt_printf(err, "incorrect key_u64s: got %u, should be %u",
- f->key_u64s, DIV_ROUND_UP(bits, 64));
- return -BCH_ERR_invalid;
- }
-
- return 0;
-}
-
-void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
-{
- prt_printf(out, "u64s %u fields ", f->key_u64s);
-
- for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
- if (i)
- prt_str(out, ", ");
- prt_printf(out, "%u:%llu",
- f->bits_per_field[i],
- le64_to_cpu(f->field_offset[i]));
- }
-}
-
-/*
- * Most significant differing bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
- const struct bkey_packed *l_k,
- const struct bkey_packed *r_k)
-{
- const u64 *l = high_word(&b->format, l_k);
- const u64 *r = high_word(&b->format, r_k);
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned word_bits = 64 - high_bit_offset;
- u64 l_v, r_v;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
- /* for big endian, skip past header */
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (nr_key_bits) {
- if (nr_key_bits < word_bits) {
- l_v >>= word_bits - nr_key_bits;
- r_v >>= word_bits - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= word_bits;
- }
-
- if (l_v != r_v)
- return fls64(l_v ^ r_v) - 1 + nr_key_bits;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- word_bits = 64;
- }
-
- return 0;
-}
-
-/*
- * First set bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
-{
- const u64 *p = high_word(&b->format, k);
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned ret = 0, offset;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
- offset = nr_key_bits;
- while (offset > 64) {
- p = next_word(p);
- offset -= 64;
- }
-
- offset = 64 - offset;
-
- while (nr_key_bits) {
- unsigned bits = nr_key_bits + offset < 64
- ? nr_key_bits
- : 64 - offset;
-
- u64 mask = (~0ULL >> (64 - bits)) << offset;
-
- if (*p & mask)
- return ret + __ffs64(*p & mask) - offset;
-
- p = prev_word(p);
- nr_key_bits -= bits;
- ret += bits;
- offset = 0;
- }
-
- return 0;
-}
-
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-
-#define I(_x) (*(out)++ = (_x))
-#define I1(i0) I(i0)
-#define I2(i0, i1) (I1(i0), I(i1))
-#define I3(i0, i1, i2) (I2(i0, i1), I(i2))
-#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3))
-#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4))
-
-static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
- enum bch_bkey_fields field,
- unsigned dst_offset, unsigned dst_size,
- bool *eax_zeroed)
-{
- unsigned bits = format->bits_per_field[field];
- u64 offset = le64_to_cpu(format->field_offset[field]);
- unsigned i, byte, bit_offset, align, shl, shr;
-
- if (!bits && !offset) {
- if (!*eax_zeroed) {
- /* xor eax, eax */
- I2(0x31, 0xc0);
- }
-
- *eax_zeroed = true;
- goto set_field;
- }
-
- if (!bits) {
- /* just return offset: */
-
- switch (dst_size) {
- case 8:
- if (offset > S32_MAX) {
- /* mov [rdi + dst_offset], offset */
- I3(0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
-
- I3(0xc7, 0x47, dst_offset + 4);
- memcpy(out, (void *) &offset + 4, 4);
- out += 4;
- } else {
- /* mov [rdi + dst_offset], offset */
- /* sign extended */
- I4(0x48, 0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
- }
- break;
- case 4:
- /* mov [rdi + dst_offset], offset */
- I3(0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
- break;
- default:
- BUG();
- }
-
- return out;
- }
-
- bit_offset = format->key_u64s * 64;
- for (i = 0; i <= field; i++)
- bit_offset -= format->bits_per_field[i];
-
- byte = bit_offset / 8;
- bit_offset -= byte * 8;
-
- *eax_zeroed = false;
-
- if (bit_offset == 0 && bits == 8) {
- /* movzx eax, BYTE PTR [rsi + imm8] */
- I4(0x0f, 0xb6, 0x46, byte);
- } else if (bit_offset == 0 && bits == 16) {
- /* movzx eax, WORD PTR [rsi + imm8] */
- I4(0x0f, 0xb7, 0x46, byte);
- } else if (bit_offset + bits <= 32) {
- align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
- byte -= align;
- bit_offset += align * 8;
-
- BUG_ON(bit_offset + bits > 32);
-
- /* mov eax, [rsi + imm8] */
- I3(0x8b, 0x46, byte);
-
- if (bit_offset) {
- /* shr eax, imm8 */
- I3(0xc1, 0xe8, bit_offset);
- }
-
- if (bit_offset + bits < 32) {
- unsigned mask = ~0U >> (32 - bits);
-
- /* and eax, imm32 */
- I1(0x25);
- memcpy(out, &mask, 4);
- out += 4;
- }
- } else if (bit_offset + bits <= 64) {
- align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
- byte -= align;
- bit_offset += align * 8;
-
- BUG_ON(bit_offset + bits > 64);
-
- /* mov rax, [rsi + imm8] */
- I4(0x48, 0x8b, 0x46, byte);
-
- shl = 64 - bit_offset - bits;
- shr = bit_offset + shl;
-
- if (shl) {
- /* shl rax, imm8 */
- I4(0x48, 0xc1, 0xe0, shl);
- }
-
- if (shr) {
- /* shr rax, imm8 */
- I4(0x48, 0xc1, 0xe8, shr);
- }
- } else {
- align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
- byte -= align;
- bit_offset += align * 8;
-
- BUG_ON(bit_offset + bits > 96);
-
- /* mov rax, [rsi + byte] */
- I4(0x48, 0x8b, 0x46, byte);
-
- /* mov edx, [rsi + byte + 8] */
- I3(0x8b, 0x56, byte + 8);
-
- /* bits from next word: */
- shr = bit_offset + bits - 64;
- BUG_ON(shr > bit_offset);
-
- /* shr rax, bit_offset */
- I4(0x48, 0xc1, 0xe8, shr);
-
- /* shl rdx, imm8 */
- I4(0x48, 0xc1, 0xe2, 64 - shr);
-
- /* or rax, rdx */
- I3(0x48, 0x09, 0xd0);
-
- shr = bit_offset - shr;
-
- if (shr) {
- /* shr rax, imm8 */
- I4(0x48, 0xc1, 0xe8, shr);
- }
- }
-
- /* rax += offset: */
- if (offset > S32_MAX) {
- /* mov rdx, imm64 */
- I2(0x48, 0xba);
- memcpy(out, &offset, 8);
- out += 8;
- /* add %rdx, %rax */
- I3(0x48, 0x01, 0xd0);
- } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
- /* add rax, imm32 */
- I2(0x48, 0x05);
- memcpy(out, &offset, 4);
- out += 4;
- } else if (offset) {
- /* add eax, imm32 */
- I1(0x05);
- memcpy(out, &offset, 4);
- out += 4;
- }
-set_field:
- switch (dst_size) {
- case 8:
- /* mov [rdi + dst_offset], rax */
- I4(0x48, 0x89, 0x47, dst_offset);
- break;
- case 4:
- /* mov [rdi + dst_offset], eax */
- I3(0x89, 0x47, dst_offset);
- break;
- default:
- BUG();
- }
-
- return out;
-}
-
-int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
-{
- bool eax_zeroed = false;
- u8 *out = _out;
-
- /*
- * rdi: dst - unpacked key
- * rsi: src - packed key
- */
-
- /* k->u64s, k->format, k->type */
-
- /* mov eax, [rsi] */
- I2(0x8b, 0x06);
-
- /* add eax, BKEY_U64s - format->key_u64s */
- I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
-
- /* and eax, imm32: mask out k->pad: */
- I5(0x25, 0xff, 0xff, 0xff, 0);
-
- /* mov [rdi], eax */
- I2(0x89, 0x07);
-
-#define x(id, field) \
- out = compile_bkey_field(format, out, id, \
- offsetof(struct bkey, field), \
- sizeof(((struct bkey *) NULL)->field), \
- &eax_zeroed);
- bkey_fields()
-#undef x
-
- /* retq */
- I1(0xc3);
-
- return (void *) out - _out;
-}
-
-#else
-#endif
-
-__pure
-int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
-{
- return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
-}
-
-__pure __flatten
-int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
- const struct bkey_packed *l,
- const struct bpos *r)
-{
- return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
-}
-
-__pure __flatten
-int bch2_bkey_cmp_packed(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed_inlined(b, l, r);
-}
-
-__pure __flatten
-int __bch2_bkey_cmp_left_packed(const struct btree *b,
- const struct bkey_packed *l,
- const struct bpos *r)
-{
- const struct bkey *l_unpacked;
-
- return unlikely(l_unpacked = packed_to_bkey_c(l))
- ? bpos_cmp(l_unpacked->p, *r)
- : __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-void bch2_bpos_swab(struct bpos *p)
-{
- u8 *l = (u8 *) p;
- u8 *h = ((u8 *) &p[1]) - 1;
-
- while (l < h) {
- swap(*l, *h);
- l++;
- --h;
- }
-}
-
-void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
-{
- const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
- u8 *l = k->key_start;
- u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1;
-
- while (l < h) {
- swap(*l, *h);
- l++;
- --h;
- }
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_bkey_pack_test(void)
-{
- struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
- struct bkey_packed p;
-
- struct bkey_format test_format = {
- .key_u64s = 3,
- .nr_fields = BKEY_NR_FIELDS,
- .bits_per_field = {
- 13,
- 64,
- 32,
- },
- };
-
- struct unpack_state in_s =
- unpack_state_init(&bch2_bkey_format_current, (void *) &t);
- struct pack_state out_s = pack_state_init(&test_format, &p);
- unsigned i;
-
- for (i = 0; i < out_s.format->nr_fields; i++) {
- u64 a, v = get_inc_field(&in_s, i);
-
- switch (i) {
-#define x(id, field) case id: a = t.field; break;
- bkey_fields()
-#undef x
- default:
- BUG();
- }
-
- if (a != v)
- panic("got %llu actual %llu i %u\n", v, a, i);
-
- if (!set_inc_field(&out_s, i, v))
- panic("failed at %u\n", i);
- }
-
- BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
-}
-#endif
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
deleted file mode 100644
index 3ccd521c190a..000000000000
--- a/fs/bcachefs/bkey.h
+++ /dev/null
@@ -1,605 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_H
-#define _BCACHEFS_BKEY_H
-
-#include <linux/bug.h>
-#include "bcachefs_format.h"
-#include "bkey_types.h"
-#include "btree_types.h"
-#include "util.h"
-#include "vstructs.h"
-
-#if 0
-
-/*
- * compiled unpack functions are disabled, pending a new interface for
- * dynamically allocating executable memory:
- */
-
-#ifdef CONFIG_X86_64
-#define HAVE_BCACHEFS_COMPILED_UNPACK 1
-#endif
-#endif
-
-void bch2_bkey_packed_to_binary_text(struct printbuf *,
- const struct bkey_format *,
- const struct bkey_packed *);
-
-enum bkey_lr_packed {
- BKEY_PACKED_BOTH,
- BKEY_PACKED_RIGHT,
- BKEY_PACKED_LEFT,
- BKEY_PACKED_NONE,
-};
-
-#define bkey_lr_packed(_l, _r) \
- ((_l)->format + ((_r)->format << 1))
-
-static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
-{
- memcpy_u64s_small(dst, src, src->u64s);
-}
-
-static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
-{
- memcpy_u64s_small(dst, src, src->k.u64s);
-}
-
-struct btree;
-
-__pure
-unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-__pure
-unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
-
-__pure
-int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
- const struct bkey_packed *,
- const struct btree *);
-
-__pure
-int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
- const struct bkey_packed *,
- const struct bpos *);
-
-__pure
-int bch2_bkey_cmp_packed(const struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-
-__pure
-int __bch2_bkey_cmp_left_packed(const struct btree *,
- const struct bkey_packed *,
- const struct bpos *);
-
-static inline __pure
-int bkey_cmp_left_packed(const struct btree *b,
- const struct bkey_packed *l, const struct bpos *r)
-{
- return __bch2_bkey_cmp_left_packed(b, l, r);
-}
-
-/*
- * The compiler generates better code when we pass bpos by ref, but it's often
- * enough terribly convenient to pass it by val... as much as I hate c++, const
- * ref would be nice here:
- */
-__pure __flatten
-static inline int bkey_cmp_left_packed_byval(const struct btree *b,
- const struct bkey_packed *l,
- struct bpos r)
-{
- return bkey_cmp_left_packed(b, l, &r);
-}
-
-static __always_inline bool bpos_eq(struct bpos l, struct bpos r)
-{
- return !((l.inode ^ r.inode) |
- (l.offset ^ r.offset) |
- (l.snapshot ^ r.snapshot));
-}
-
-static __always_inline bool bpos_lt(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode ? l.inode < r.inode :
- l.offset != r.offset ? l.offset < r.offset :
- l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false;
-}
-
-static __always_inline bool bpos_le(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode ? l.inode < r.inode :
- l.offset != r.offset ? l.offset < r.offset :
- l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true;
-}
-
-static __always_inline bool bpos_gt(struct bpos l, struct bpos r)
-{
- return bpos_lt(r, l);
-}
-
-static __always_inline bool bpos_ge(struct bpos l, struct bpos r)
-{
- return bpos_le(r, l);
-}
-
-static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
-{
- return cmp_int(l.inode, r.inode) ?:
- cmp_int(l.offset, r.offset) ?:
- cmp_int(l.snapshot, r.snapshot);
-}
-
-static inline struct bpos bpos_min(struct bpos l, struct bpos r)
-{
- return bpos_lt(l, r) ? l : r;
-}
-
-static inline struct bpos bpos_max(struct bpos l, struct bpos r)
-{
- return bpos_gt(l, r) ? l : r;
-}
-
-static __always_inline bool bkey_eq(struct bpos l, struct bpos r)
-{
- return !((l.inode ^ r.inode) |
- (l.offset ^ r.offset));
-}
-
-static __always_inline bool bkey_lt(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode
- ? l.inode < r.inode
- : l.offset < r.offset;
-}
-
-static __always_inline bool bkey_le(struct bpos l, struct bpos r)
-{
- return l.inode != r.inode
- ? l.inode < r.inode
- : l.offset <= r.offset;
-}
-
-static __always_inline bool bkey_gt(struct bpos l, struct bpos r)
-{
- return bkey_lt(r, l);
-}
-
-static __always_inline bool bkey_ge(struct bpos l, struct bpos r)
-{
- return bkey_le(r, l);
-}
-
-static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
-{
- return cmp_int(l.inode, r.inode) ?:
- cmp_int(l.offset, r.offset);
-}
-
-static inline struct bpos bkey_min(struct bpos l, struct bpos r)
-{
- return bkey_lt(l, r) ? l : r;
-}
-
-static inline struct bpos bkey_max(struct bpos l, struct bpos r)
-{
- return bkey_gt(l, r) ? l : r;
-}
-
-static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
-{
- return bpos_eq(l.k->p, r.k->p) &&
- l.k->size == r.k->size &&
- bkey_bytes(l.k) == bkey_bytes(r.k) &&
- !memcmp(l.v, r.v, bkey_val_bytes(l.k));
-}
-
-void bch2_bpos_swab(struct bpos *);
-void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
-
-static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
-{
- return cmp_int(l.hi, r.hi) ?:
- cmp_int(l.lo, r.lo);
-}
-
-#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
-#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
-
-static __always_inline bool bversion_zero(struct bversion v)
-{
- return bversion_cmp(v, ZERO_VERSION) == 0;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-/* statement expressions confusing unlikely()? */
-#define bkey_packed(_k) \
- ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
- (_k)->format != KEY_FORMAT_CURRENT; })
-#else
-#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
-#endif
-
-/*
- * It's safe to treat an unpacked bkey as a packed one, but not the reverse
- */
-static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
-{
- return (struct bkey_packed *) k;
-}
-
-static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
-{
- return (const struct bkey_packed *) k;
-}
-
-static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
-{
- return bkey_packed(k) ? NULL : (struct bkey_i *) k;
-}
-
-static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
-{
- return bkey_packed(k) ? NULL : (const struct bkey *) k;
-}
-
-static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
-{
- return format->bits_per_field[BKEY_FIELD_INODE] +
- format->bits_per_field[BKEY_FIELD_OFFSET] +
- format->bits_per_field[BKEY_FIELD_SNAPSHOT];
-}
-
-static inline struct bpos bpos_successor(struct bpos p)
-{
- if (!++p.snapshot &&
- !++p.offset &&
- !++p.inode)
- BUG();
-
- return p;
-}
-
-static inline struct bpos bpos_predecessor(struct bpos p)
-{
- if (!p.snapshot-- &&
- !p.offset-- &&
- !p.inode--)
- BUG();
-
- return p;
-}
-
-static inline struct bpos bpos_nosnap_successor(struct bpos p)
-{
- p.snapshot = 0;
-
- if (!++p.offset &&
- !++p.inode)
- BUG();
-
- return p;
-}
-
-static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
-{
- p.snapshot = 0;
-
- if (!p.offset-- &&
- !p.inode--)
- BUG();
-
- return p;
-}
-
-static inline u64 bkey_start_offset(const struct bkey *k)
-{
- return k->p.offset - k->size;
-}
-
-static inline struct bpos bkey_start_pos(const struct bkey *k)
-{
- return (struct bpos) {
- .inode = k->p.inode,
- .offset = bkey_start_offset(k),
- .snapshot = k->p.snapshot,
- };
-}
-
-/* Packed helpers */
-
-static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
-}
-
-static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
- const struct bkey_packed *k)
-{
- return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
-}
-
-static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkeyp_key_u64s(format, k) * sizeof(u64);
-}
-
-static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return k->u64s - bkeyp_key_u64s(format, k);
-}
-
-static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkeyp_val_u64s(format, k) * sizeof(u64);
-}
-
-static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
- struct bkey_packed *k, unsigned val_u64s)
-{
- k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
-}
-
-#define bkeyp_val(_format, _k) \
- ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
-
-extern const struct bkey_format bch2_bkey_format_current;
-
-bool bch2_bkey_transform(const struct bkey_format *,
- struct bkey_packed *,
- const struct bkey_format *,
- const struct bkey_packed *);
-
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
- const struct bkey_packed *);
-
-#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *,
- const struct bkey_packed *);
-#endif
-
-bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
- const struct bkey_format *);
-
-enum bkey_pack_pos_ret {
- BKEY_PACK_POS_EXACT,
- BKEY_PACK_POS_SMALLER,
- BKEY_PACK_POS_FAIL,
-};
-
-enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
- const struct btree *);
-
-static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
- const struct btree *b)
-{
- return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
-}
-
-void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
- const struct bkey_packed *);
-bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
- const struct bkey_format *);
-
-typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-
-static inline void
-__bkey_unpack_key_format_checked(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
- if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
- compiled_unpack_fn unpack_fn = b->aux_data;
- unpack_fn(dst, src);
-
- if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
- struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
-
- BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
- }
- } else {
- *dst = __bch2_bkey_unpack_key(&b->format, src);
- }
-}
-
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
- struct bkey dst;
-
- __bkey_unpack_key_format_checked(b, &dst, src);
- return dst;
-}
-
-static inline void __bkey_unpack_key(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
- if (likely(bkey_packed(src)))
- __bkey_unpack_key_format_checked(b, dst, src);
- else
- *dst = *packed_to_bkey_c(src);
-}
-
-/**
- * bkey_unpack_key -- unpack just the key, not the value
- */
-static inline struct bkey bkey_unpack_key(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_key_format_checked(b, src)
- : *packed_to_bkey_c(src);
-}
-
-static inline struct bpos
-bkey_unpack_pos_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
- return bkey_unpack_key_format_checked(b, src).p;
-#else
- return __bkey_unpack_pos(&b->format, src);
-#endif
-}
-
-static inline struct bpos bkey_unpack_pos(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_pos_format_checked(b, src)
- : packed_to_bkey_c(src)->p;
-}
-
-/* Disassembled bkeys */
-
-static inline struct bkey_s_c bkey_disassemble(const struct btree *b,
- const struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
-}
-
-/* non const version: */
-static inline struct bkey_s __bkey_disassemble(const struct btree *b,
- struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
-}
-
-static inline u64 bkey_field_max(const struct bkey_format *f,
- enum bch_bkey_fields nr)
-{
- return f->bits_per_field[nr] < 64
- ? (le64_to_cpu(f->field_offset[nr]) +
- ~(~0ULL << f->bits_per_field[nr]))
- : U64_MAX;
-}
-
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
-
-int bch2_compile_bkey_format(const struct bkey_format *, void *);
-
-#else
-
-static inline int bch2_compile_bkey_format(const struct bkey_format *format,
- void *out) { return 0; }
-
-#endif
-
-static inline void bkey_reassemble(struct bkey_i *dst,
- struct bkey_s_c src)
-{
- dst->k = *src.k;
- memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
-}
-
-/* byte order helpers */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-
-static inline unsigned high_word_offset(const struct bkey_format *f)
-{
- return f->key_u64s - 1;
-}
-
-#define high_bit_offset 0
-#define nth_word(p, n) ((p) - (n))
-
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-
-static inline unsigned high_word_offset(const struct bkey_format *f)
-{
- return 0;
-}
-
-#define high_bit_offset KEY_PACKED_BITS_START
-#define nth_word(p, n) ((p) + (n))
-
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f))
-#define next_word(p) nth_word(p, 1)
-#define prev_word(p) nth_word(p, -1)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_bkey_pack_test(void);
-#else
-static inline void bch2_bkey_pack_test(void) {}
-#endif
-
-#define bkey_fields() \
- x(BKEY_FIELD_INODE, p.inode) \
- x(BKEY_FIELD_OFFSET, p.offset) \
- x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
- x(BKEY_FIELD_SIZE, size) \
- x(BKEY_FIELD_VERSION_HI, bversion.hi) \
- x(BKEY_FIELD_VERSION_LO, bversion.lo)
-
-struct bkey_format_state {
- u64 field_min[BKEY_NR_FIELDS];
- u64 field_max[BKEY_NR_FIELDS];
-};
-
-void bch2_bkey_format_init(struct bkey_format_state *);
-
-static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v)
-{
- s->field_min[field] = min(s->field_min[field], v);
- s->field_max[field] = max(s->field_max[field], v);
-}
-
-/*
- * Expands the ranges tracked in @s so that @k can be packed by the format
- * eventually built from it:
- */
-static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
-{
-#define x(id, field) __bkey_format_add(s, id, k->field);
- bkey_fields()
-#undef x
-}
-
-void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
-
-static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i)
-{
- unsigned f_bits = f->bits_per_field[i];
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (f_bits > unpacked_bits)
- return true;
-
- if ((f_bits == unpacked_bits) && field_offset)
- return true;
-
- u64 f_mask = f_bits
- ? ~((~0ULL << (f_bits - 1)) << 1)
- : 0;
-
- if (((field_offset + f_mask) & unpacked_mask) < field_offset)
- return true;
- return false;
-}
-
-int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
- enum bch_validate_flags, struct printbuf *);
-void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
-
-#endif /* _BCACHEFS_BKEY_H */
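The bkey_format_state helpers above boil down to per-field min/max tracking: bch2_bkey_format_init() starts each field's range empty, every key added widens it, and bch2_bkey_format_done() can then encode a field as an offset (the minimum) plus just enough bits to cover the span, which is what bkey_field_max() and bch2_bkey_format_field_overflows() reason about. A standalone sketch of that idea for a single field, with hypothetical names rather than the in-tree API:

	#include <stdint.h>
	#include <stdio.h>

	/* Track the observed range of one key field, then derive a packed encoding. */
	struct field_range {
		uint64_t min, max;
	};

	static void range_init(struct field_range *r)
	{
		r->min = UINT64_MAX;
		r->max = 0;
	}

	static void range_add(struct field_range *r, uint64_t v)
	{
		if (v < r->min)
			r->min = v;
		if (v > r->max)
			r->max = v;
	}

	/* Bits needed to encode (max - min): the analogue of bits_per_field. */
	static unsigned range_bits(const struct field_range *r)
	{
		uint64_t span = r->max - r->min;
		unsigned bits = 0;

		while (span) {
			bits++;
			span >>= 1;
		}
		return bits;
	}

	int main(void)
	{
		struct field_range r;

		range_init(&r);
		range_add(&r, 4096);
		range_add(&r, 4100);
		range_add(&r, 5000);

		/* field_offset is the minimum, bits_per_field covers the span: */
		printf("offset %llu bits %u\n",
		       (unsigned long long) r.min, range_bits(&r));
		return 0;
	}

Keys whose fields all fall inside the tracked ranges can then be stored in far fewer bits than the unpacked 64-bit-per-field representation.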
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
deleted file mode 100644
index a30c4ae8eb36..000000000000
--- a/fs/bcachefs/bkey_buf.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_BUF_H
-#define _BCACHEFS_BKEY_BUF_H
-
-#include "bcachefs.h"
-#include "bkey.h"
-
-struct bkey_buf {
- struct bkey_i *k;
- u64 onstack[12];
-};
-
-static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
- struct bch_fs *c, unsigned u64s)
-{
- if (s->k == (void *) s->onstack &&
- u64s > ARRAY_SIZE(s->onstack)) {
- s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
- memcpy(s->k, s->onstack, sizeof(s->onstack));
- }
-}
-
-static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
- struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_buf_realloc(s, c, k.k->u64s);
- bkey_reassemble(s->k, k);
-}
-
-static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
- struct bch_fs *c,
- struct bkey_i *src)
-{
- bch2_bkey_buf_realloc(s, c, src->k.u64s);
- bkey_copy(s->k, src);
-}
-
-static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
- struct bch_fs *c,
- struct btree *b,
- struct bkey_packed *src)
-{
- bch2_bkey_buf_realloc(s, c, BKEY_U64s +
- bkeyp_val_u64s(&b->format, src));
- bch2_bkey_unpack(b, s->k, src);
-}
-
-static inline void bch2_bkey_buf_init(struct bkey_buf *s)
-{
- s->k = (void *) s->onstack;
-}
-
-static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
-{
- if (s->k != (void *) s->onstack)
- mempool_free(s->k, &c->large_bkey_pool);
- s->k = NULL;
-}
-
-#endif /* _BCACHEFS_BKEY_BUF_H */
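struct bkey_buf above is a small-buffer pattern: a key lives in the on-stack array until it outgrows it, at which point it spills into the large-key mempool, and bch2_bkey_buf_exit() only frees the spilled case. A standalone sketch of the same pattern, with hypothetical names and malloc() standing in for the mempool:

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	#define ONSTACK_U64S	12
	#define BUF_MAX_U64S	256	/* hypothetical worst-case key size */

	struct small_buf {
		uint64_t *p;
		uint64_t onstack[ONSTACK_U64S];
	};

	static void small_buf_init(struct small_buf *b)
	{
		b->p = b->onstack;
	}

	static int small_buf_realloc(struct small_buf *b, size_t u64s)
	{
		/*
		 * Spill to the heap only the first time the data outgrows the
		 * stack array; as with the mempool above, the spilled
		 * allocation is sized for the largest key we expect.
		 */
		if (b->p == b->onstack && u64s > ONSTACK_U64S) {
			uint64_t *n = malloc(BUF_MAX_U64S * sizeof(*n));

			if (!n)
				return -1;
			memcpy(n, b->onstack, sizeof(b->onstack));
			b->p = n;
		}
		return 0;
	}

	static void small_buf_exit(struct small_buf *b)
	{
		if (b->p != b->onstack)
			free(b->p);
		b->p = NULL;
	}

	int main(void)
	{
		struct small_buf b;

		small_buf_init(&b);
		small_buf_realloc(&b, 100);	/* 100 u64s: spills to the heap */
		small_buf_exit(&b);
		return 0;
	}

The common case (small keys) never touches the allocator, which is why the callers above can use these buffers freely on hot paths.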
diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h
deleted file mode 100644
index 5f42a6e69360..000000000000
--- a/fs/bcachefs/bkey_cmp.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_CMP_H
-#define _BCACHEFS_BKEY_CMP_H
-
-#include "bkey.h"
-
-#ifdef CONFIG_X86_64
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- long d0, d1, d2, d3;
- int cmp;
-
-	/* we shouldn't need asm for this, but gcc generates suboptimal code here: */
-
- asm(".intel_syntax noprefix;"
- "xor eax, eax;"
- "xor edx, edx;"
- "1:;"
- "mov r8, [rdi];"
- "mov r9, [rsi];"
- "sub ecx, 64;"
- "jl 2f;"
-
- "cmp r8, r9;"
- "jnz 3f;"
-
- "lea rdi, [rdi - 8];"
- "lea rsi, [rsi - 8];"
- "jmp 1b;"
-
- "2:;"
- "not ecx;"
- "shr r8, 1;"
- "shr r9, 1;"
- "shr r8, cl;"
- "shr r9, cl;"
- "cmp r8, r9;"
-
- "3:\n"
- "seta al;"
- "setb dl;"
- "sub eax, edx;"
- ".att_syntax prefix;"
- : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
- : "0" (l), "1" (r), "3" (nr_key_bits)
- : "r8", "r9", "cc", "memory");
-
- return cmp;
-}
-#else
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- u64 l_v, r_v;
-
- if (!nr_key_bits)
- return 0;
-
- /* for big endian, skip past header */
- nr_key_bits += high_bit_offset;
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (1) {
- if (nr_key_bits < 64) {
- l_v >>= 64 - nr_key_bits;
- r_v >>= 64 - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= 64;
- }
-
- if (!nr_key_bits || l_v != r_v)
- break;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- }
-
- return cmp_int(l_v, r_v);
-}
-#endif
-
-static inline __pure __flatten
-int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
-{
- const struct bkey_format *f = &b->format;
- int ret;
-
- EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- ret = __bkey_cmp_bits(high_word(f, l),
- high_word(f, r),
- b->nr_key_bits);
-
- EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
- bkey_unpack_pos(b, r)));
- return ret;
-}
-
-static inline __pure __flatten
-int bch2_bkey_cmp_packed_inlined(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- struct bkey unpacked;
-
- if (likely(bkey_packed(l) && bkey_packed(r)))
- return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
-
- if (bkey_packed(l)) {
- __bkey_unpack_key_format_checked(b, &unpacked, l);
- l = (void *) &unpacked;
- } else if (bkey_packed(r)) {
- __bkey_unpack_key_format_checked(b, &unpacked, r);
- r = (void *) &unpacked;
- }
-
- return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
-}
-
-#endif /* _BCACHEFS_BKEY_CMP_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
deleted file mode 100644
index fcd8c82cba4f..000000000000
--- a/fs/bcachefs/bkey_methods.c
+++ /dev/null
@@ -1,497 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_types.h"
-#include "alloc_background.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "lru.h"
-#include "quota.h"
-#include "reflink.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "xattr.h"
-
-const char * const bch2_bkey_types[] = {
-#define x(name, nr, ...) #name,
- BCH_BKEY_TYPES()
-#undef x
- NULL
-};
-
-static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-#define bch2_bkey_ops_deleted ((struct bkey_ops) { \
- .key_validate = deleted_key_validate, \
-})
-
-#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \
- .key_validate = deleted_key_validate, \
-})
-
-static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_bytes(k.k),
- c, bkey_val_size_nonzero,
- "incorrect value size (%zu != 0)",
- bkey_val_bytes(k.k));
-fsck_err:
- return ret;
-}
-
-#define bch2_bkey_ops_error ((struct bkey_ops) { \
- .key_validate = empty_val_key_validate, \
-})
-
-static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
-
- prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
-}
-
-#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
- .key_validate = key_type_cookie_validate, \
- .val_to_text = key_type_cookie_to_text, \
- .min_val_size = 8, \
-})
-
-#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
- .key_validate = empty_val_key_validate, \
-})
-
-static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
- unsigned datalen = bkey_inline_data_bytes(k.k);
-
- prt_printf(out, "datalen %u: %*phN",
- datalen, min(datalen, 32U), d.v->data);
-}
-
-#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
- .key_validate = key_type_inline_data_validate, \
- .val_to_text = key_type_inline_data_to_text, \
-})
-
-static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-#define bch2_bkey_ops_set ((struct bkey_ops) { \
- .key_validate = empty_val_key_validate, \
- .key_merge = key_type_set_merge, \
-})
-
-const struct bkey_ops bch2_bkey_ops[] = {
-#define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
- BCH_BKEY_TYPES()
-#undef x
-};
-
-const struct bkey_ops bch2_bkey_null_ops = {
-};
-
-int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
- return 0;
-
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size,
- c, bkey_val_size_too_small,
- "bad val size (%zu < %u)",
- bkey_val_bytes(k.k), ops->min_val_size);
-
- if (!ops->key_validate)
- return 0;
-
- ret = ops->key_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-static u64 bch2_key_types_allowed[] = {
- [BKEY_TYPE_btree] =
- BIT_ULL(KEY_TYPE_deleted)|
- BIT_ULL(KEY_TYPE_btree_ptr)|
- BIT_ULL(KEY_TYPE_btree_ptr_v2),
-#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
- BCH_BTREE_IDS()
-#undef x
-};
-
-static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = {
-#define x(name, nr, flags) [KEY_TYPE_##name] = flags,
- BCH_BKEY_TYPES()
-#undef x
-};
-
-const char *bch2_btree_node_type_str(enum btree_node_type type)
-{
- return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
-}
-
-int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- enum btree_node_type type = __btree_node_type(from.level, from.btree);
-
- if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
- return 0;
-
- int ret = 0;
-
- bkey_fsck_err_on(k.k->u64s < BKEY_U64s,
- c, bkey_u64s_too_small,
- "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
-
- if (type >= BKEY_TYPE_NR)
- return 0;
-
- enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX
- ? bch2_bkey_type_flags[k.k->type]
- : 0;
-
- bool strict_key_type_allowed =
- (from.flags & BCH_VALIDATE_commit) ||
- type == BKEY_TYPE_btree ||
- (from.btree < BTREE_ID_NR &&
- (bkey_flags & BKEY_TYPE_strict_btree_checks));
-
- bkey_fsck_err_on(strict_key_type_allowed &&
- k.k->type < KEY_TYPE_MAX &&
- !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
- c, bkey_invalid_type_for_btree,
- "invalid key type for btree %s (%s)",
- bch2_btree_node_type_str(type),
- k.k->type < KEY_TYPE_MAX
- ? bch2_bkey_types[k.k->type]
- : "(unknown)");
-
- if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
- bkey_fsck_err_on(k.k->size == 0,
- c, bkey_extent_size_zero,
- "size == 0");
-
- bkey_fsck_err_on(k.k->size > k.k->p.offset,
- c, bkey_extent_size_greater_than_offset,
- "size greater than offset (%u > %llu)",
- k.k->size, k.k->p.offset);
- } else {
- bkey_fsck_err_on(k.k->size,
- c, bkey_size_nonzero,
- "size != 0");
- }
-
- if (type != BKEY_TYPE_btree) {
- enum btree_id btree = type - 1;
-
- if (btree_type_has_snapshots(btree)) {
- bkey_fsck_err_on(!k.k->p.snapshot,
- c, bkey_snapshot_zero,
- "snapshot == 0");
- } else if (!btree_type_has_snapshot_field(btree)) {
- bkey_fsck_err_on(k.k->p.snapshot,
- c, bkey_snapshot_nonzero,
- "nonzero snapshot");
- } else {
- /*
- * btree uses snapshot field but it's not required to be
- * nonzero
- */
- }
-
- bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX),
- c, bkey_at_pos_max,
- "key at POS_MAX");
- }
-fsck_err:
- return ret;
-}
-
-int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return __bch2_bkey_validate(c, k, from) ?:
- bch2_bkey_val_validate(c, k, from);
-}
-
-int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key),
- c, bkey_before_start_of_btree_node,
- "key before start of btree node");
-
- bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key),
- c, bkey_after_end_of_btree_node,
- "key past end of btree node");
-fsck_err:
- return ret;
-}
-
-void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
-{
- if (bpos_eq(pos, POS_MIN))
- prt_printf(out, "POS_MIN");
- else if (bpos_eq(pos, POS_MAX))
- prt_printf(out, "POS_MAX");
- else if (bpos_eq(pos, SPOS_MAX))
- prt_printf(out, "SPOS_MAX");
- else {
- if (pos.inode == U64_MAX)
- prt_printf(out, "U64_MAX");
- else
- prt_printf(out, "%llu", pos.inode);
- prt_printf(out, ":");
- if (pos.offset == U64_MAX)
- prt_printf(out, "U64_MAX");
- else
- prt_printf(out, "%llu", pos.offset);
- prt_printf(out, ":");
- if (pos.snapshot == U32_MAX)
- prt_printf(out, "U32_MAX");
- else
- prt_printf(out, "%u", pos.snapshot);
- }
-}
-
-void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
-{
- if (k) {
- prt_printf(out, "u64s %u type ", k->u64s);
-
- if (k->type < KEY_TYPE_MAX)
- prt_printf(out, "%s ", bch2_bkey_types[k->type]);
- else
- prt_printf(out, "%u ", k->type);
-
- bch2_bpos_to_text(out, k->p);
-
- prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo);
- } else {
- prt_printf(out, "(null)");
- }
-}
-
-void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
- if (likely(ops->val_to_text))
- ops->val_to_text(out, c, k);
-}
-
-void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_to_text(out, k.k);
-
- if (bkey_val_bytes(k.k)) {
- prt_printf(out, ": ");
- bch2_val_to_text(out, c, k);
- }
-}
-
-void bch2_bkey_swab_val(struct bkey_s k)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
- if (ops->swab)
- ops->swab(k);
-}
-
-bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
- return ops->key_normalize
- ? ops->key_normalize(c, k)
- : false;
-}
-
-bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
-
- return ops->key_merge &&
- bch2_bkey_maybe_mergable(l.k, r.k) &&
- (u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
- !static_branch_unlikely(&bch2_key_merging_disabled) &&
- ops->key_merge(c, l, r);
-}
-
-static const struct old_bkey_type {
- u8 btree_node_type;
- u8 old;
- u8 new;
-} bkey_renumber_table[] = {
- {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr },
- {BKEY_TYPE_extents, 128, KEY_TYPE_extent },
- {BKEY_TYPE_extents, 129, KEY_TYPE_extent },
- {BKEY_TYPE_extents, 130, KEY_TYPE_reservation },
- {BKEY_TYPE_inodes, 128, KEY_TYPE_inode },
- {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation },
- {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent },
- {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout },
- {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr },
- {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout },
- {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc },
- {BKEY_TYPE_quotas, 128, KEY_TYPE_quota },
-};
-
-void bch2_bkey_renumber(enum btree_node_type btree_node_type,
- struct bkey_packed *k,
- int write)
-{
- const struct old_bkey_type *i;
-
- for (i = bkey_renumber_table;
- i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table);
- i++)
- if (btree_node_type == i->btree_node_type &&
- k->type == (write ? i->new : i->old)) {
- k->type = write ? i->old : i->new;
- break;
- }
-}
-
-void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write,
- struct bkey_format *f,
- struct bkey_packed *k)
-{
- const struct bkey_ops *ops;
- struct bkey uk;
- unsigned nr_compat = 5;
- int i;
-
- /*
- * Do these operations in reverse order in the write path:
- */
-
- for (i = 0; i < nr_compat; i++)
- switch (!write ? i : nr_compat - 1 - i) {
- case 0:
- if (big_endian != CPU_BIG_ENDIAN) {
- bch2_bkey_swab_key(f, k);
- } else if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- bch2_bkey_swab_key(f, k);
- bch2_bkey_swab_key(f, k);
- }
- break;
- case 1:
- if (version < bcachefs_metadata_version_bkey_renumber)
- bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
- break;
- case 2:
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_inodes) {
- if (!bkey_packed(k)) {
- struct bkey_i *u = packed_to_bkey(k);
-
- swap(u->k.p.inode, u->k.p.offset);
- } else if (f->bits_per_field[BKEY_FIELD_INODE] &&
- f->bits_per_field[BKEY_FIELD_OFFSET]) {
- struct bkey_format tmp = *f, *in = f, *out = &tmp;
-
- swap(tmp.bits_per_field[BKEY_FIELD_INODE],
- tmp.bits_per_field[BKEY_FIELD_OFFSET]);
- swap(tmp.field_offset[BKEY_FIELD_INODE],
- tmp.field_offset[BKEY_FIELD_OFFSET]);
-
- if (!write)
- swap(in, out);
-
- uk = __bch2_bkey_unpack_key(in, k);
- swap(uk.p.inode, uk.p.offset);
- BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
- }
- }
- break;
- case 3:
- if (version < bcachefs_metadata_version_snapshot &&
- (level || btree_type_has_snapshots(btree_id))) {
- struct bkey_i *u = packed_to_bkey(k);
-
- if (u) {
- u->k.p.snapshot = write
- ? 0 : U32_MAX;
- } else {
- u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]);
- u64 max_packed = min_packed +
- ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
- uk = __bch2_bkey_unpack_key(f, k);
- uk.p.snapshot = write
- ? min_packed : min_t(u64, U32_MAX, max_packed);
-
- BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
- }
- }
-
- break;
- case 4: {
- struct bkey_s u;
-
- if (!bkey_packed(k)) {
- u = bkey_i_to_s(packed_to_bkey(k));
- } else {
- uk = __bch2_bkey_unpack_key(f, k);
- u.k = &uk;
- u.v = bkeyp_val(f, k);
- }
-
- if (big_endian != CPU_BIG_ENDIAN)
- bch2_bkey_swab_val(u);
-
- ops = bch2_bkey_type_ops(k->type);
-
- if (ops->compat)
- ops->compat(btree_id, version, big_endian, write, u);
- break;
- }
- default:
- BUG();
- }
-}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
deleted file mode 100644
index bf34111cdf00..000000000000
--- a/fs/bcachefs/bkey_methods.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_METHODS_H
-#define _BCACHEFS_BKEY_METHODS_H
-
-#include "bkey.h"
-
-struct bch_fs;
-struct btree;
-struct btree_trans;
-struct bkey;
-enum btree_node_type;
-
-extern const char * const bch2_bkey_types[];
-extern const struct bkey_ops bch2_bkey_null_ops;
-
-/*
- * key_validate: checks validity of @k, returns 0 if valid or a negative error
- * code if not. If invalid, the entire key will be deleted.
- *
- * Errors are reported via the fsck_err machinery; @from describes the context
- * the key is being validated in (see struct bkey_validate_context), and more
- * aggressive checks can be enabled for some contexts, e.g. at commit time.
- */
-struct bkey_ops {
- int (*key_validate)(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from);
- void (*val_to_text)(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
- void (*swab)(struct bkey_s);
- bool (*key_normalize)(struct bch_fs *, struct bkey_s);
- bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
- int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
- void (*compat)(enum btree_id id, unsigned version,
- unsigned big_endian, int write,
- struct bkey_s);
-
- /* Size of value type when first created: */
- unsigned min_val_size;
-};
-
-extern const struct bkey_ops bch2_bkey_ops[];
-
-static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
-{
- return likely(type < KEY_TYPE_MAX)
- ? &bch2_bkey_ops[type]
- : &bch2_bkey_null_ops;
-}
-
-int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c,
- struct bkey_validate_context from);
-
-void bch2_bpos_to_text(struct printbuf *, struct bpos);
-void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
-void bch2_val_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-
-void bch2_bkey_swab_val(struct bkey_s);
-
-bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
-
-static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
-{
- return l->type == r->type &&
- !bversion_cmp(l->bversion, r->bversion) &&
- bpos_eq(l->p, bkey_start_pos(r));
-}
-
-bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-static inline int bch2_key_trigger(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
-
- return ops->trigger
- ? ops->trigger(trans, btree, level, old, new, flags)
- : 0;
-}
-
-static inline int bch2_key_trigger_old(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i deleted;
-
- bkey_init(&deleted.k);
- deleted.k.p = old.k->p;
-
- return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
- BTREE_TRIGGER_overwrite|flags);
-}
-
-static inline int bch2_key_trigger_new(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i deleted;
-
- bkey_init(&deleted.k);
- deleted.k.p = new.k->p;
-
- return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
- BTREE_TRIGGER_insert|flags);
-}
-
-void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
-
-void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
- int, struct bkey_format *, struct bkey_packed *);
-
-static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write,
- struct bkey_format *f,
- struct bkey_packed *k)
-{
- if (version < bcachefs_metadata_version_current ||
- big_endian != CPU_BIG_ENDIAN ||
- IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- __bch2_bkey_compat(level, btree_id, version,
- big_endian, write, f, k);
-
-}
-
-#endif /* _BCACHEFS_BKEY_METHODS_H */
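bch2_bkey_type_ops() above is a bounds-checked table lookup with bch2_bkey_null_ops as the fallback, so callers such as bch2_bkey_merge() test individual hooks for NULL rather than testing whether the type is known. A standalone sketch of that dispatch shape, with hypothetical types and names:

	#include <stdio.h>

	struct val_ops {
		const char *(*describe)(void);
	};

	enum { TYPE_deleted, TYPE_cookie, TYPE_MAX };

	static const char *describe_cookie(void)
	{
		return "cookie";
	}

	static const struct val_ops ops_table[TYPE_MAX] = {
		[TYPE_cookie] = { .describe = describe_cookie },
	};

	/* Empty ops: every hook is NULL, so callers check hooks, not types. */
	static const struct val_ops null_ops;

	static const struct val_ops *type_ops(unsigned type)
	{
		return type < TYPE_MAX ? &ops_table[type] : &null_ops;
	}

	int main(void)
	{
		const struct val_ops *ops = type_ops(TYPE_cookie);

		if (ops->describe)
			printf("%s\n", ops->describe());

		ops = type_ops(42);	/* unknown type falls back to null_ops */
		if (!ops->describe)
			printf("no describe hook\n");
		return 0;
	}

The fallback object means an unrecognized on-disk type degrades to "no optional behaviour" instead of an out-of-bounds table read.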
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
deleted file mode 100644
index 4536eb50fc40..000000000000
--- a/fs/bcachefs/bkey_sort.c
+++ /dev/null
@@ -1,214 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_cmp.h"
-#include "bkey_sort.h"
-#include "bset.h"
-#include "extents.h"
-
-typedef int (*sort_cmp_fn)(const struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-
-static inline bool sort_iter_end(struct sort_iter *iter)
-{
- return !iter->used;
-}
-
-static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
- sort_cmp_fn cmp)
-{
- unsigned i;
-
- for (i = from;
- i + 1 < iter->used &&
- cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
- i++)
- swap(iter->data[i], iter->data[i + 1]);
-}
-
-static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
-{
- unsigned i = iter->used;
-
- while (i--)
- sort_iter_sift(iter, i, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
-{
- return !sort_iter_end(iter) ? iter->data->k : NULL;
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
-{
- struct sort_iter_set *i = iter->data;
-
- BUG_ON(!iter->used);
-
- i->k = bkey_p_next(i->k);
-
- BUG_ON(i->k > i->end);
-
- if (i->k == i->end)
- array_remove_item(iter->data, iter->used, 0);
- else
- sort_iter_sift(iter, 0, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
- sort_cmp_fn cmp)
-{
- struct bkey_packed *ret = sort_iter_peek(iter);
-
- if (ret)
- sort_iter_advance(iter, cmp);
-
- return ret;
-}
-
-/*
- * If keys compare equal, compare by pointer order:
- */
-static inline int key_sort_fix_overlapping_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed(b, l, r) ?:
- cmp_int((unsigned long) l, (unsigned long) r);
-}
-
-static inline bool should_drop_next_key(struct sort_iter *iter)
-{
-	/*
-	 * key_sort_fix_overlapping_cmp() ensures that when keys compare equal
-	 * the older key comes first; so if the first two keys in the iterator
-	 * compare equal, the first one is older and should be dropped.
-	 */
- return iter->used >= 2 &&
- !bch2_bkey_cmp_packed(iter->b,
- iter->data[0].k,
- iter->data[1].k);
-}
-
-struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
- struct sort_iter *iter)
-{
- struct bkey_packed *out = dst->start;
- struct bkey_packed *k;
- struct btree_nr_keys nr;
-
- memset(&nr, 0, sizeof(nr));
-
- sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
-
- while ((k = sort_iter_peek(iter))) {
- if (!bkey_deleted(k) &&
- !should_drop_next_key(iter)) {
- bkey_p_copy(out, k);
- btree_keys_account_key_add(&nr, 0, out);
- out = bkey_p_next(out);
- }
-
- sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
-
-/* Sort + repack in a new format: */
-struct btree_nr_keys
-bch2_sort_repack(struct bset *dst, struct btree *src,
- struct btree_node_iter *src_iter,
- struct bkey_format *out_f,
- bool filter_whiteouts)
-{
- struct bkey_format *in_f = &src->format;
- struct bkey_packed *in, *out = vstruct_last(dst);
- struct btree_nr_keys nr;
- bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
-
- memset(&nr, 0, sizeof(nr));
-
- while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
- if (filter_whiteouts && bkey_deleted(in))
- continue;
-
- if (!transform)
- bkey_p_copy(out, in);
- else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
- ? in_f : &bch2_bkey_format_current, in))
- out->format = KEY_FORMAT_LOCAL_BTREE;
- else
- bch2_bkey_unpack(src, (void *) out, in);
-
- out->needs_whiteout = false;
-
- btree_keys_account_key_add(&nr, 0, out);
- out = bkey_p_next(out);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
-
-static inline int keep_unwritten_whiteouts_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
- (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
- (long) l - (long) r;
-}
-
-#include "btree_update_interior.h"
-
-/*
- * For sorting in the btree node write path: whiteouts not in the unwritten
- * whiteouts area are dropped, whiteouts in the unwritten whiteouts area are
- * dropped if overwritten by real keys:
- */
-unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *dst, struct sort_iter *iter)
-{
- struct bkey_packed *in, *next, *out = dst;
-
- sort_iter_sort(iter, keep_unwritten_whiteouts_cmp);
-
- while ((in = sort_iter_next(iter, keep_unwritten_whiteouts_cmp))) {
- if (bkey_deleted(in) && in < unwritten_whiteouts_start(iter->b))
- continue;
-
- if ((next = sort_iter_peek(iter)) &&
- !bch2_bkey_cmp_packed_inlined(iter->b, in, next))
- continue;
-
- bkey_p_copy(out, in);
- out = bkey_p_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
-
-/*
- * Main sort routine for compacting a btree node in memory: we always drop
- * whiteouts because any whiteouts that need to be written are in the unwritten
- * whiteouts area:
- */
-unsigned bch2_sort_keys(struct bkey_packed *dst, struct sort_iter *iter)
-{
- struct bkey_packed *in, *out = dst;
-
- sort_iter_sort(iter, bch2_bkey_cmp_packed_inlined);
-
- while ((in = sort_iter_next(iter, bch2_bkey_cmp_packed_inlined))) {
- if (bkey_deleted(in))
- continue;
-
- bkey_p_copy(out, in);
- out = bkey_p_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
deleted file mode 100644
index 9be969d46890..000000000000
--- a/fs/bcachefs/bkey_sort.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_SORT_H
-#define _BCACHEFS_BKEY_SORT_H
-
-struct sort_iter {
- struct btree *b;
- unsigned used;
- unsigned size;
-
- struct sort_iter_set {
- struct bkey_packed *k, *end;
- } data[];
-};
-
-static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
-{
- iter->b = b;
- iter->used = 0;
- iter->size = size;
-}
-
-struct sort_iter_stack {
- struct sort_iter iter;
- struct sort_iter_set sets[MAX_BSETS + 1];
-};
-
-static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
-{
- sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
-}
-
-static inline void sort_iter_add(struct sort_iter *iter,
- struct bkey_packed *k,
- struct bkey_packed *end)
-{
- BUG_ON(iter->used >= iter->size);
-
- if (k != end)
- iter->data[iter->used++] = (struct sort_iter_set) { k, end };
-}
-
-struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
- struct sort_iter *);
-
-struct btree_nr_keys
-bch2_sort_repack(struct bset *, struct btree *,
- struct btree_node_iter *,
- struct bkey_format *, bool);
-
-unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *);
-unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *);
-
-#endif /* _BCACHEFS_BKEY_SORT_H */
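struct sort_iter above drives a small k-way merge: each bset contributes a (k, end) cursor, the cursors are kept ordered by their current key, and advancing the front cursor either removes it (when exhausted) or sifts it back into position. A standalone sketch of the same merge over plain int arrays, with hypothetical names:

	#include <stdio.h>

	struct cursor {
		const int *k, *end;
	};

	/* Restore ordering after data[0]'s key changed: bubble it forward. */
	static void sift(struct cursor *data, unsigned used)
	{
		for (unsigned i = 0; i + 1 < used && *data[i].k > *data[i + 1].k; i++) {
			struct cursor tmp = data[i];

			data[i]		= data[i + 1];
			data[i + 1]	= tmp;
		}
	}

	static void merge(struct cursor *data, unsigned used)
	{
		/* Initial sort: sift from the back so data[0] is the smallest. */
		for (unsigned i = used; i--;)
			sift(data + i, used - i);

		while (used) {
			printf("%d ", *data[0].k);

			if (++data[0].k == data[0].end) {
				/* cursor exhausted: drop it from the array */
				for (unsigned i = 1; i < used; i++)
					data[i - 1] = data[i];
				used--;
			} else {
				sift(data, used);
			}
		}
		printf("\n");
	}

	int main(void)
	{
		static const int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 }, c[] = { 5 };
		struct cursor data[] = {
			{ a, a + 3 }, { b, b + 3 }, { c, c + 1 },
		};

		merge(data, 3);	/* prints: 1 2 3 4 5 9 10 */
		return 0;
	}

With only a handful of bsets per node the sift is effectively constant time, which is why a heap isn't needed here.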
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
deleted file mode 100644
index b4f328f9853c..000000000000
--- a/fs/bcachefs/bkey_types.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_TYPES_H
-#define _BCACHEFS_BKEY_TYPES_H
-
-#include "bcachefs_format.h"
-
-/*
- * bkey_i - bkey with inline value
- * bkey_s - bkey with split value
- * bkey_s_c - bkey with split value, const
- */
-
-#define bkey_p_next(_k) vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
- return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
- return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
- unsigned u64s = BKEY_U64s + val_u64s;
-
- BUG_ON(u64s > U8_MAX);
- k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
- set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
-
-#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
-/* bkey with split value, const */
-struct bkey_s_c {
- const struct bkey *k;
- const struct bch_val *v;
-};
-
-/* bkey with split value */
-struct bkey_s {
- union {
- struct {
- struct bkey *k;
- struct bch_val *v;
- };
- struct bkey_s_c s_c;
- };
-};
-
-#define bkey_s_null ((struct bkey_s) { .k = NULL })
-#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
- return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
- return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
- return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
- return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from a bkey_i_extent
- * to a bkey_i - since that's always safe, instead of conversion functions.
- */
-#define x(name, ...) \
-struct bkey_i_##name { \
- union { \
- struct bkey k; \
- struct bkey_i k_i; \
- }; \
- struct bch_##name v; \
-}; \
- \
-struct bkey_s_c_##name { \
- union { \
- struct { \
- const struct bkey *k; \
- const struct bch_##name *v; \
- }; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-struct bkey_s_##name { \
- union { \
- struct { \
- struct bkey *k; \
- struct bch_##name *v; \
- }; \
- struct bkey_s_c_##name c; \
- struct bkey_s s; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline const struct bkey_i_##name * \
-bkey_i_to_##name##_c(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{ \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-name##_i_to_s_c(const struct bkey_i_##name *k) \
-{ \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-bkey_i_to_s_c_##name(const struct bkey_i *k) \
-{ \
- EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{ \
- struct bkey_i_##name *k = \
- container_of(&_k->k, struct bkey_i_##name, k); \
- \
- bkey_init(&k->k); \
- memset(&k->v, 0, sizeof(k->v)); \
- k->k.type = KEY_TYPE_##name; \
- set_bkey_val_bytes(&k->k, sizeof(k->v)); \
- \
- return k; \
-}
-
-BCH_BKEY_TYPES();
-#undef x
-
-enum bch_validate_flags {
- BCH_VALIDATE_write = BIT(0),
- BCH_VALIDATE_commit = BIT(1),
- BCH_VALIDATE_silent = BIT(2),
-};
-
-#define BKEY_VALIDATE_CONTEXTS() \
- x(unknown) \
- x(superblock) \
- x(journal) \
- x(btree_root) \
- x(btree_node) \
- x(commit)
-
-struct bkey_validate_context {
- enum {
-#define x(n) BKEY_VALIDATE_##n,
- BKEY_VALIDATE_CONTEXTS()
-#undef x
- } from:8;
- enum bch_validate_flags flags:8;
- u8 level;
- enum btree_id btree;
- bool root:1;
- unsigned journal_offset;
- u64 journal_seq;
-};
-
-#endif /* _BCACHEFS_BKEY_TYPES_H */
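The x() block above stamps out three wrapper structs and a family of checked conversion helpers per key type from the single BCH_BKEY_TYPES() list, so adding a key type is a one-line change to that list. A minimal standalone sketch of the same x-macro technique, with a hypothetical list and names:

	#include <stdio.h>

	/* One list drives the enum, the name table and the per-type structs. */
	#define VALUE_TYPES()		\
		x(cookie)		\
		x(inline_data)

	enum value_type {
	#define x(name) TYPE_##name,
		VALUE_TYPES()
	#undef x
		TYPE_MAX
	};

	static const char * const value_type_names[] = {
	#define x(name) #name,
		VALUE_TYPES()
	#undef x
	};

	/* Per-type wrapper structs and constructors, one per list entry: */
	#define x(name)							\
	struct val_##name { enum value_type type; };			\
									\
	static inline struct val_##name make_##name(void)		\
	{								\
		return (struct val_##name) { .type = TYPE_##name };	\
	}
	VALUE_TYPES()
	#undef x

	int main(void)
	{
		struct val_cookie c = make_cookie();

		printf("%s\n", value_type_names[c.type]);
		return 0;
	}

Keeping the enum, the name table and the typed wrappers generated from one list is what keeps them from drifting out of sync.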
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
deleted file mode 100644
index 32841f762eb2..000000000000
--- a/fs/bcachefs/bset.c
+++ /dev/null
@@ -1,1576 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for working with individual keys, and sorted sets of keys within a
- * btree node
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "bset.h"
-#include "eytzinger.h"
-#include "trace.h"
-#include "util.h"
-
-#include <linux/unaligned.h>
-#include <linux/console.h>
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
- struct btree *);
-
-static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
-{
- unsigned n = ARRAY_SIZE(iter->data);
-
- while (n && __btree_node_iter_set_end(iter, n - 1))
- --n;
-
- return n;
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
-{
- return bch2_bkey_to_bset_inlined(b, k);
-}
-
-/*
- * There are never duplicate live keys in the btree - but including keys that
- * have been flagged as deleted (and will be cleaned up later) we _will_ see
- * duplicates.
- *
- * Thus the sort order is: usual key comparison first, but for keys that compare
- * equal the deleted key(s) come first, and the (at most one) live version comes
- * last.
- *
- * The main reason for this is insertion: to handle overwrites, we first iterate
- * over keys that compare equal to our insert key, and then insert immediately
- * prior to the first key greater than the key we're inserting - our insert
- * position will be after all keys that compare equal to our insert key, which
- * by the time we actually do the insert will all be deleted.
- */
-
-void bch2_dump_bset(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned set)
-{
- struct bkey_packed *_k, *_n;
- struct bkey uk, n;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
-
- if (!i->u64s)
- return;
-
- for (_k = i->start;
- _k < vstruct_last(i);
- _k = _n) {
- _n = bkey_p_next(_k);
-
- if (!_k->u64s) {
- printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
- _k->_data - i->_data);
- break;
- }
-
- k = bkey_disassemble(b, _k, &uk);
-
- printbuf_reset(&buf);
- if (c)
- bch2_bkey_val_to_text(&buf, c, k);
- else
- bch2_bkey_to_text(&buf, k.k);
- printk(KERN_ERR "block %u key %5zu: %s\n", set,
- _k->_data - i->_data, buf.buf);
-
- if (_n == vstruct_last(i))
- continue;
-
- n = bkey_unpack_key(b, _n);
-
- if (bpos_lt(n.p, k.k->p)) {
- printk(KERN_ERR "Key skipped backwards\n");
- continue;
- }
-
- if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
- printk(KERN_ERR "Duplicate keys\n");
- }
-
- printbuf_exit(&buf);
-}
-
-void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
-{
- console_lock();
- for_each_bset(b, t)
- bch2_dump_bset(c, b, bset(b, t), t - b->set);
- console_unlock();
-}
-
-void bch2_dump_btree_node_iter(struct btree *b,
- struct btree_node_iter *iter)
-{
- struct btree_node_iter_set *set;
- struct printbuf buf = PRINTBUF;
-
- printk(KERN_ERR "btree node iter with %u/%u sets:\n",
- __btree_node_iter_used(iter), b->nsets);
-
- btree_node_iter_for_each(iter, set) {
- struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
- struct bkey uk = bkey_unpack_key(b, k);
-
- printbuf_reset(&buf);
- bch2_bkey_to_text(&buf, &uk);
- printk(KERN_ERR "set %zu key %u: %s\n",
- t - b->set, set->k, buf.buf);
- }
-
- printbuf_exit(&buf);
-}
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
-{
- struct bkey_packed *k;
- struct btree_nr_keys nr = {};
-
- for_each_bset(b, t)
- bset_tree_for_each_key(b, t, k)
- if (!bkey_deleted(k))
- btree_keys_account_key_add(&nr, t - b->set, k);
- return nr;
-}
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
-{
- struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
-
- BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
-}
-
-static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
- struct btree *b)
-{
- struct btree_node_iter iter = *_iter;
- const struct bkey_packed *k, *n;
-
- k = bch2_btree_node_iter_peek_all(&iter, b);
- __bch2_btree_node_iter_advance(&iter, b);
- n = bch2_btree_node_iter_peek_all(&iter, b);
-
- bkey_unpack_key(b, k);
-
- if (n &&
- bkey_iter_cmp(b, k, n) > 0) {
- struct btree_node_iter_set *set;
- struct bkey ku = bkey_unpack_key(b, k);
- struct bkey nu = bkey_unpack_key(b, n);
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&buf1, &ku);
- bch2_bkey_to_text(&buf2, &nu);
- printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
- buf1.buf, buf2.buf);
- printk(KERN_ERR "iter was:");
-
- btree_node_iter_for_each(_iter, set) {
- struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
- struct bset_tree *t = bch2_bkey_to_bset(b, k2);
- printk(" [%zi %zi]", t - b->set,
- k2->_data - bset(b, t)->_data);
- }
- panic("\n");
- }
-}
-
-void __bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct btree_node_iter_set *set, *s2;
- struct bkey_packed *k, *p;
-
- if (bch2_btree_node_iter_end(iter))
- return;
-
- /* Verify no duplicates: */
- btree_node_iter_for_each(iter, set) {
- BUG_ON(set->k > set->end);
- btree_node_iter_for_each(iter, s2)
- BUG_ON(set != s2 && set->end == s2->end);
- }
-
- /* Verify that set->end is correct: */
- btree_node_iter_for_each(iter, set) {
- for_each_bset(b, t)
- if (set->end == t->end_offset) {
- BUG_ON(set->k < btree_bkey_first_offset(t) ||
- set->k >= t->end_offset);
- goto found;
- }
- BUG();
-found:
- do {} while (0);
- }
-
- /* Verify iterator is sorted: */
- btree_node_iter_for_each(iter, set)
- BUG_ON(set != iter->data &&
- btree_node_iter_cmp(b, set[-1], set[0]) > 0);
-
- k = bch2_btree_node_iter_peek_all(iter, b);
-
- for_each_bset(b, t) {
- if (iter->data[0].end == t->end_offset)
- continue;
-
- p = bch2_bkey_prev_all(b, t,
- bch2_btree_node_iter_bset_pos(iter, b, t));
-
- BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
- }
-}
-
-static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
- struct bkey_packed *insert, unsigned clobber_u64s)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, where);
- struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
- struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-#if 0
- BUG_ON(prev &&
- bkey_iter_cmp(b, prev, insert) > 0);
-#else
- if (prev &&
- bkey_iter_cmp(b, prev, insert) > 0) {
- struct bkey k1 = bkey_unpack_key(b, prev);
- struct bkey k2 = bkey_unpack_key(b, insert);
-
- bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&buf1, &k1);
- bch2_bkey_to_text(&buf2, &k2);
-
- panic("prev > insert:\n"
- "prev key %s\n"
- "insert key %s\n",
- buf1.buf, buf2.buf);
- }
-#endif
-#if 0
- BUG_ON(next != btree_bkey_last(b, t) &&
- bkey_iter_cmp(b, insert, next) > 0);
-#else
- if (next != btree_bkey_last(b, t) &&
- bkey_iter_cmp(b, insert, next) > 0) {
- struct bkey k1 = bkey_unpack_key(b, insert);
- struct bkey k2 = bkey_unpack_key(b, next);
-
- bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&buf1, &k1);
- bch2_bkey_to_text(&buf2, &k2);
-
- panic("insert > next:\n"
- "insert key %s\n"
- "next key %s\n",
- buf1.buf, buf2.buf);
- }
-#endif
-}
-
-static inline void bch2_verify_insert_pos(struct btree *b,
- struct bkey_packed *where,
- struct bkey_packed *insert,
- unsigned clobber_u64s)
-{
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
- __bch2_verify_insert_pos(b, where, insert, clobber_u64s);
-}
-
-
-/* Auxiliary search trees */
-
-#define BFLOAT_FAILED_UNPACKED U8_MAX
-#define BFLOAT_FAILED U8_MAX
-
-struct bkey_float {
- u8 exponent;
- u8 key_offset;
- u16 mantissa;
-};
-#define BKEY_MANTISSA_BITS 16
-
-struct ro_aux_tree {
- u8 nothing[0];
- struct bkey_float f[];
-};
-
-struct rw_aux_tree {
- u16 offset;
- struct bpos k;
-};
-
-static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
-{
- BUG_ON(t->aux_data_offset == U16_MAX);
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- return t->aux_data_offset;
- case BSET_RO_AUX_TREE:
- return t->aux_data_offset +
- DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8);
- case BSET_RW_AUX_TREE:
- return t->aux_data_offset +
- DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
- default:
- BUG();
- }
-}
-
-static unsigned bset_aux_tree_buf_start(const struct btree *b,
- const struct bset_tree *t)
-{
- return t == b->set
- ? DIV_ROUND_UP(b->unpack_fn_len, 8)
- : bset_aux_tree_buf_end(t - 1);
-}
-
-static void *__aux_tree_base(const struct btree *b,
- const struct bset_tree *t)
-{
- return b->aux_data + t->aux_data_offset * 8;
-}
-
-static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- return __aux_tree_base(b, t);
-}
-
-static struct bkey_float *bkey_float(const struct btree *b,
- const struct bset_tree *t,
- unsigned idx)
-{
- return ro_aux_tree_base(b, t)->f + idx;
-}
-
-static void __bset_aux_tree_verify(struct btree *b)
-{
- for_each_bset(b, t) {
- if (t->aux_data_offset == U16_MAX)
- continue;
-
- BUG_ON(t != b->set &&
- t[-1].aux_data_offset == U16_MAX);
-
- BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
- BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
- BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
- }
-}
-
-static inline void bset_aux_tree_verify(struct btree *b)
-{
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
- __bset_aux_tree_verify(b);
-}
-
-void bch2_btree_keys_init(struct btree *b)
-{
- unsigned i;
-
- b->nsets = 0;
- memset(&b->nr, 0, sizeof(b->nr));
-
- for (i = 0; i < MAX_BSETS; i++)
- b->set[i].data_offset = U16_MAX;
-
- bch2_bset_set_no_aux_tree(b, b->set);
-}
-
-/* Binary tree stuff for auxiliary search trees */
-
-/*
- * Cacheline/offset <-> bkey pointer arithmetic:
- *
- * t->tree is a binary search tree in an array; each node corresponds to a key
- * in one cacheline in t->set (BSET_CACHELINE bytes).
- *
- * This means we don't have to store the full index of the key that a node in
- * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline,
- * and then bkey_float->key_offset gives us the offset within that cacheline,
- * in units of 8 bytes.
- *
- * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
- * make this work.
- *
- * To construct the bfloat for a given tree position we need neighbouring keys
- * to compare against: make_bfloat() uses the keys at the bounding ancestor
- * positions in the eytzinger tree (or the node's min/max key at the edges),
- * and the greatest bit in which they differ determines which bits of the key
- * go into bkey_float->mantissa.
- */
-
-static inline void *bset_cacheline(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline)
-{
- return (void *) round_down((unsigned long) btree_bkey_first(b, t),
- L1_CACHE_BYTES) +
- cacheline * BSET_CACHELINE;
-}
-
-static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- unsigned offset)
-{
- return bset_cacheline(b, t, cacheline) + offset * 8;
-}
-
-static unsigned bkey_to_cacheline(const struct btree *b,
- const struct bset_tree *t,
- const struct bkey_packed *k)
-{
- return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
-}
-
-static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- const struct bkey_packed *k)
-{
- return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
-}
-
-static unsigned bkey_to_cacheline_offset(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- const struct bkey_packed *k)
-{
- size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
-
- EBUG_ON(m > U8_MAX);
- return m;
-}
-
-static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned j)
-{
- return cacheline_to_bkey(b, t,
- __eytzinger1_to_inorder(j, t->size - 1, t->extra),
- bkey_float(b, t, j)->key_offset);
-}
-
-static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
-
- return __aux_tree_base(b, t);
-}
-
-/*
- * For the write set - the one we're currently inserting keys into - we don't
- * maintain a full search tree, we just keep a simple lookup table (the rw aux
- * tree of offset/pos pairs):
- */
-static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
- struct bset_tree *t,
- unsigned j)
-{
- return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
-}
-
-static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
- unsigned j, struct bkey_packed *k)
-{
- EBUG_ON(k >= btree_bkey_last(b, t));
-
- rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
- .offset = __btree_node_key_to_offset(b, k),
- .k = bkey_unpack_pos(b, k),
- };
-}
-
-static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t)
-{
- struct bkey_packed *k = btree_bkey_first(b, t);
- unsigned j = 0;
-
- BUG_ON(bset_has_ro_aux_tree(t));
-
- if (!bset_has_rw_aux_tree(t))
- return;
-
- BUG_ON(t->size < 1);
- BUG_ON(rw_aux_to_bkey(b, t, j) != k);
-
- goto start;
- while (1) {
- if (rw_aux_to_bkey(b, t, j) == k) {
- BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
- bkey_unpack_pos(b, k)));
-start:
- if (++j == t->size)
- break;
-
- BUG_ON(rw_aux_tree(b, t)[j].offset <=
- rw_aux_tree(b, t)[j - 1].offset);
- }
-
- k = bkey_p_next(k);
- BUG_ON(k >= btree_bkey_last(b, t));
- }
-}
-
-static inline void bch2_bset_verify_rw_aux_tree(struct btree *b,
- struct bset_tree *t)
-{
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
- __bch2_bset_verify_rw_aux_tree(b, t);
-}
-
-/* returns idx of first entry >= offset: */
-static unsigned rw_aux_tree_bsearch(struct btree *b,
- struct bset_tree *t,
- unsigned offset)
-{
- unsigned bset_offs = offset - btree_bkey_first_offset(t);
- unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
- unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
-
- EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
- EBUG_ON(!t->size);
- EBUG_ON(idx > t->size);
-
- while (idx < t->size &&
- rw_aux_tree(b, t)[idx].offset < offset)
- idx++;
-
- while (idx &&
- rw_aux_tree(b, t)[idx - 1].offset >= offset)
- idx--;
-
- EBUG_ON(idx < t->size &&
- rw_aux_tree(b, t)[idx].offset < offset);
- EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
- EBUG_ON(idx + 1 < t->size &&
- rw_aux_tree(b, t)[idx].offset ==
- rw_aux_tree(b, t)[idx + 1].offset);
-
- return idx;
-}
-
-static inline unsigned bkey_mantissa(const struct bkey_packed *k,
- const struct bkey_float *f)
-{
- u64 v;
-
- EBUG_ON(!bkey_packed(k));
-
- v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
-
- /*
- * In little endian, we're shifting off low bits (and then the bits we
- * want are at the low end), in big endian we're shifting off high bits
- * (and then the bits we want are at the high end, so we shift them
- * back down):
- */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- v >>= f->exponent & 7;
-#else
- v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
-#endif
- return (u16) v;
-}
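bkey_mantissa() above amounts to: do an unaligned u64 load at byte offset exponent / 8, then shift by exponent % 8 (on little endian) so the wanted 16 bits land at the bottom of the word. A standalone little-endian sketch of that extraction, with hypothetical names; it assumes a little-endian host, matching the LE branch above:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/*
	 * Extract 16 bits starting at bit 'start' of a little-endian bit
	 * string; the caller guarantees at least 8 readable bytes at
	 * start / 8, as packed keys are always whole u64s.
	 */
	static uint16_t extract16(const uint8_t *bytes, unsigned start)
	{
		uint64_t v;

		/* unaligned load of the u64 containing the wanted bits */
		memcpy(&v, bytes + (start >> 3), sizeof(v));
		return (uint16_t) (v >> (start & 7));
	}

	int main(void)
	{
		uint8_t buf[16] = {};

		/* set bits 12..27 of the buffer to 0xBEEF (LE bit numbering) */
		uint32_t word = 0xBEEFu << 4;
		memcpy(buf + 1, &word, sizeof(word));

		printf("0x%X\n", extract16(buf, 12));	/* prints 0xBEEF */
		return 0;
	}
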
-
-static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
-{
- struct bkey_float *f = bkey_float(b, t, j);
- struct bkey_packed *m = tree_to_bkey(b, t, j);
- struct bkey_packed *l = is_power_of_2(j)
- ? min_key
- : tree_to_bkey(b, t, j >> ffs(j));
- struct bkey_packed *r = is_power_of_2(j + 1)
- ? max_key
- : tree_to_bkey(b, t, j >> (ffz(j) + 1));
- unsigned mantissa;
- int shift, exponent, high_bit;
-
- /*
- * for failed bfloats, the lookup code falls back to comparing against
- * the original key.
- */
-
- if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
- !b->nr_key_bits) {
- f->exponent = BFLOAT_FAILED_UNPACKED;
- return;
- }
-
- /*
- * The greatest differing bit of l and r is the first bit we must
- * include in the bfloat mantissa we're creating in order to do
- * comparisons - that bit always becomes the high bit of
- * bfloat->mantissa, and thus the exponent we're calculating here is
- * the position of what will become the low bit in bfloat->mantissa:
- *
- * Note that this may be negative - we may be running off the low end
- * of the key: we handle this later:
- */
- high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
- min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
- exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
-
- /*
- * Then we calculate the actual shift value, from the start of the key
- * (k->_data), to get the key bits starting at exponent:
- */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
-
- EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
-#else
- shift = high_bit_offset +
- b->nr_key_bits -
- exponent -
- BKEY_MANTISSA_BITS;
-
- EBUG_ON(shift < KEY_PACKED_BITS_START);
-#endif
- EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
-
- f->exponent = shift;
- mantissa = bkey_mantissa(m, f);
-
- /*
- * If we've got garbage bits, set them to all 1s - it's legal for the
- * bfloat to compare larger than the original key, but not smaller:
- */
- if (exponent < 0)
- mantissa |= ~(~0U << -exponent);
-
- f->mantissa = mantissa;
-}
-
-/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
- bset_aux_tree_verify(b);
-
- return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
-}
-
-static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
- return __bset_tree_capacity(b, t) / sizeof(struct bkey_float);
-}
-
-static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
-{
- return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
-}
-
-static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
-{
- struct bkey_packed *k;
-
- t->size = 1;
- t->extra = BSET_RW_AUX_TREE_VAL;
- rw_aux_tree(b, t)[0].offset =
- __btree_node_key_to_offset(b, btree_bkey_first(b, t));
-
- bset_tree_for_each_key(b, t, k) {
- if (t->size == bset_rw_tree_capacity(b, t))
- break;
-
- if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
- L1_CACHE_BYTES)
- rw_aux_tree_set(b, t, t->size++, k);
- }
-}
-
-static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
-{
- struct bkey_packed *k = btree_bkey_first(b, t);
- struct bkey_i min_key, max_key;
- unsigned cacheline = 1;
-
- t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
- bset_ro_tree_capacity(b, t));
-retry:
- if (t->size < 2) {
- t->size = 0;
- t->extra = BSET_NO_AUX_TREE_VAL;
- return;
- }
-
- t->extra = eytzinger1_extra(t->size - 1);
-
- /* First we figure out where the first key in each cacheline is */
- eytzinger1_for_each(j, t->size - 1) {
- while (bkey_to_cacheline(b, t, k) < cacheline)
- k = bkey_p_next(k);
-
- if (k >= btree_bkey_last(b, t)) {
- /* XXX: this path sucks */
- t->size--;
- goto retry;
- }
-
- bkey_float(b, t, j)->key_offset =
- bkey_to_cacheline_offset(b, t, cacheline++, k);
-
- EBUG_ON(tree_to_bkey(b, t, j) != k);
- }
-
- if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
- bkey_init(&min_key.k);
- min_key.k.p = b->data->min_key;
- }
-
- if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
- bkey_init(&max_key.k);
- max_key.k.p = b->data->max_key;
- }
-
- /* Then we build the tree */
- eytzinger1_for_each(j, t->size - 1)
- make_bfloat(b, t, j,
- bkey_to_packed(&min_key),
- bkey_to_packed(&max_key));
-}
-
-static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
-{
- struct bset_tree *i;
-
- for (i = b->set; i != t; i++)
- BUG_ON(bset_has_rw_aux_tree(i));
-
- bch2_bset_set_no_aux_tree(b, t);
-
- /* round up to next cacheline: */
- t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
- SMP_CACHE_BYTES / sizeof(u64));
-
- bset_aux_tree_verify(b);
-}
-
-void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
- bool writeable)
-{
- if (writeable
- ? bset_has_rw_aux_tree(t)
- : bset_has_ro_aux_tree(t))
- return;
-
- bset_alloc_tree(b, t);
-
- if (!__bset_tree_capacity(b, t))
- return;
-
- if (writeable)
- __build_rw_aux_tree(b, t);
- else
- __build_ro_aux_tree(b, t);
-
- bset_aux_tree_verify(b);
-}
-
-void bch2_bset_init_first(struct btree *b, struct bset *i)
-{
- struct bset_tree *t;
-
- BUG_ON(b->nsets);
-
- memset(i, 0, sizeof(*i));
- get_random_bytes(&i->seq, sizeof(i->seq));
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- t = &b->set[b->nsets++];
- set_btree_bset(b, t, i);
-}
-
-void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
-{
- struct bset *i = &bne->keys;
- struct bset_tree *t;
-
- BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
- BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
- BUG_ON(b->nsets >= MAX_BSETS);
-
- memset(i, 0, sizeof(*i));
- i->seq = btree_bset_first(b)->seq;
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- t = &b->set[b->nsets++];
- set_btree_bset(b, t, i);
-}
-
-/*
- * find _some_ key in the same bset as @k that precedes @k - not necessarily the
- * immediate predecessor:
- */
-static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct bkey_packed *p;
- unsigned offset;
- int j;
-
- EBUG_ON(k < btree_bkey_first(b, t) ||
- k > btree_bkey_last(b, t));
-
- if (k == btree_bkey_first(b, t))
- return NULL;
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- p = btree_bkey_first(b, t);
- break;
- case BSET_RO_AUX_TREE:
- j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
-
- do {
- p = j ? tree_to_bkey(b, t,
- __inorder_to_eytzinger1(j--,
- t->size - 1, t->extra))
- : btree_bkey_first(b, t);
- } while (p >= k);
- break;
- case BSET_RW_AUX_TREE:
- offset = __btree_node_key_to_offset(b, k);
- j = rw_aux_tree_bsearch(b, t, offset);
- p = j ? rw_aux_to_bkey(b, t, j - 1)
- : btree_bkey_first(b, t);
- break;
- }
-
- return p;
-}
-
-struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k,
- unsigned min_key_type)
-{
- struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
-
- while ((p = __bkey_prev(b, t, k)) && !ret) {
- for (i = p; i != k; i = bkey_p_next(i))
- if (i->type >= min_key_type)
- ret = i;
-
- k = p;
- }
-
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
- BUG_ON(ret >= orig_k);
-
- for (i = ret
- ? bkey_p_next(ret)
- : btree_bkey_first(b, t);
- i != orig_k;
- i = bkey_p_next(i))
- BUG_ON(i->type >= min_key_type);
- }
-
- return ret;
-}
-
-/* Insert */
-
-static void rw_aux_tree_insert_entry(struct btree *b,
- struct bset_tree *t,
- unsigned idx)
-{
- EBUG_ON(!idx || idx > t->size);
- struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
- struct bkey_packed *end = idx < t->size
- ? rw_aux_to_bkey(b, t, idx)
- : btree_bkey_last(b, t);
-
- if (t->size < bset_rw_tree_capacity(b, t) &&
- (void *) end - (void *) start > L1_CACHE_BYTES) {
- struct bkey_packed *k = start;
-
- while (1) {
- k = bkey_p_next(k);
- if (k == end)
- break;
-
- if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
- memmove(&rw_aux_tree(b, t)[idx + 1],
- &rw_aux_tree(b, t)[idx],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[idx]);
- t->size++;
- rw_aux_tree_set(b, t, idx, k);
- break;
- }
- }
- }
-}
-
-static void bch2_bset_fix_lookup_table(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *_where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- int shift = new_u64s - clobber_u64s;
- unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
-
- EBUG_ON(bset_has_ro_aux_tree(t));
-
- if (!bset_has_rw_aux_tree(t))
- return;
-
- if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
- rw_aux_tree_insert_entry(b, t, t->size);
- goto verify;
- }
-
- /* returns first entry >= where */
- idx = rw_aux_tree_bsearch(b, t, where);
-
- if (rw_aux_tree(b, t)[idx].offset == where) {
- if (!idx) { /* never delete first entry */
- idx++;
- } else if (where < t->end_offset) {
- rw_aux_tree_set(b, t, idx++, _where);
- } else {
- EBUG_ON(where != t->end_offset);
- rw_aux_tree_insert_entry(b, t, --t->size);
- goto verify;
- }
- }
-
- EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
- if (idx < t->size &&
- rw_aux_tree(b, t)[idx].offset + shift ==
- rw_aux_tree(b, t)[idx - 1].offset) {
- memmove(&rw_aux_tree(b, t)[idx],
- &rw_aux_tree(b, t)[idx + 1],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[idx + 1]);
- t->size -= 1;
- }
-
- for (j = idx; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
-
- EBUG_ON(idx < t->size &&
- rw_aux_tree(b, t)[idx].offset ==
- rw_aux_tree(b, t)[idx - 1].offset);
-
- rw_aux_tree_insert_entry(b, t, idx);
-
-verify:
- bch2_bset_verify_rw_aux_tree(b, t);
- bset_aux_tree_verify(b);
-}
-
-void bch2_bset_insert(struct btree *b,
- struct bkey_packed *where,
- struct bkey_i *insert,
- unsigned clobber_u64s)
-{
- struct bkey_format *f = &b->format;
- struct bset_tree *t = bset_tree_last(b);
- struct bkey_packed packed, *src = bkey_to_packed(insert);
-
- bch2_bset_verify_rw_aux_tree(b, t);
- bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
-
- if (bch2_bkey_pack_key(&packed, &insert->k, f))
- src = &packed;
-
- if (!bkey_deleted(&insert->k))
- btree_keys_account_key_add(&b->nr, t - b->set, src);
-
- if (src->u64s != clobber_u64s) {
- u64 *src_p = (u64 *) where->_data + clobber_u64s;
- u64 *dst_p = (u64 *) where->_data + src->u64s;
-
- EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
- (int) clobber_u64s - src->u64s);
-
- memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
- le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
- set_btree_bset_end(b, t);
- }
-
- memcpy_u64s_small(where, src,
- bkeyp_key_u64s(f, src));
- memcpy_u64s(bkeyp_val(f, where), &insert->v,
- bkeyp_val_u64s(f, src));
-
- if (src->u64s != clobber_u64s)
- bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
-
- bch2_verify_btree_nr_keys(b);
-}
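
[Editorial aside, not part of the deleted file: the interesting part of the insert path above is the memmove - the tail of the bset is shifted by (new size - clobbered size) u64s to open or close a gap, then the new key is copied into place. A self-contained sketch of that shift-and-copy on a plain u64 array, with hypothetical names:]

#include <assert.h>
#include <stdint.h>
#include <string.h>

static void insert_u64s(uint64_t *buf, unsigned *used, unsigned pos,
			const uint64_t *new, unsigned new_u64s,
			unsigned clobber_u64s)
{
	/* shift everything after the clobbered range to make room */
	memmove(buf + pos + new_u64s,
		buf + pos + clobber_u64s,
		(*used - (pos + clobber_u64s)) * sizeof(*buf));

	memcpy(buf + pos, new, new_u64s * sizeof(*buf));
	*used += new_u64s - clobber_u64s;
}

int main(void)
{
	uint64_t buf[8] = { 1, 2, 3, 4 };
	uint64_t new[2] = { 9, 9 };
	unsigned used = 4;

	insert_u64s(buf, &used, 1, new, 2, 1);	/* replace {2} with {9, 9} */
	assert(used == 5 && buf[1] == 9 && buf[2] == 9 && buf[3] == 3);
	return 0;
}
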
-
-void bch2_bset_delete(struct btree *b,
- struct bkey_packed *where,
- unsigned clobber_u64s)
-{
- struct bset_tree *t = bset_tree_last(b);
- u64 *src_p = (u64 *) where->_data + clobber_u64s;
- u64 *dst_p = where->_data;
-
- bch2_bset_verify_rw_aux_tree(b, t);
-
- EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
-
- memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
- le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
- set_btree_bset_end(b, t);
-
- bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
-}
-
-/* Lookup */
-
-__flatten
-static struct bkey_packed *bset_search_write_set(const struct btree *b,
- struct bset_tree *t,
- struct bpos *search)
-{
- unsigned l = 0, r = t->size;
-
- while (l + 1 != r) {
- unsigned m = (l + r) >> 1;
-
- if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
- l = m;
- else
- r = m;
- }
-
- return rw_aux_to_bkey(b, t, l);
-}
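
[Editorial aside, not part of the deleted file: the binary search above leaves l as the last entry known to compare less than the search key - entry 0 is never tested, matching the table built earlier - so the caller can start its linear scan at entry l. The same loop on a plain integer table, as a standalone sketch with hypothetical names:]

#include <assert.h>

/* Return the index to start a linear scan from: the last entry < search */
static unsigned lookup_start(const int *table, unsigned size, int search)
{
	unsigned l = 0, r = size;

	while (l + 1 != r) {
		unsigned m = (l + r) >> 1;

		if (table[m] < search)
			l = m;
		else
			r = m;
	}

	return l;	/* entry 0 is never tested, so the scan may start slightly early */
}

int main(void)
{
	int table[] = { 0, 10, 20, 30 };

	assert(lookup_start(table, 4, 25) == 2);
	assert(lookup_start(table, 4, 5) == 0);
	return 0;
}
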
-
-static inline void prefetch_four_cachelines(void *p)
-{
-#ifdef CONFIG_X86_64
- asm("prefetcht0 (-127 + 64 * 0)(%0);"
- "prefetcht0 (-127 + 64 * 1)(%0);"
- "prefetcht0 (-127 + 64 * 2)(%0);"
- "prefetcht0 (-127 + 64 * 3)(%0);"
- :
- : "r" (p + 127));
-#else
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- prefetch(p + L1_CACHE_BYTES * 3);
-#endif
-}
-
-static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
- const struct bkey_float *f)
-{
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
-
- return f->exponent > key_bits_start;
-#else
- unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
-
- return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
-#endif
-}
-
-__flatten
-static struct bkey_packed *bset_search_tree(const struct btree *b,
- const struct bset_tree *t,
- const struct bpos *search,
- const struct bkey_packed *packed_search)
-{
- struct ro_aux_tree *base = ro_aux_tree_base(b, t);
- struct bkey_float *f;
- struct bkey_packed *k;
- unsigned inorder, n = 1, l, r;
- int cmp;
-
- do {
- if (likely(n << 4 < t->size))
- prefetch(&base->f[n << 4]);
-
- f = &base->f[n];
- if (unlikely(f->exponent >= BFLOAT_FAILED))
- goto slowpath;
-
- l = f->mantissa;
- r = bkey_mantissa(packed_search, f);
-
- if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f))
- goto slowpath;
-
- n = n * 2 + (l < r);
- continue;
-slowpath:
- k = tree_to_bkey(b, t, n);
- cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
- if (!cmp)
- return k;
-
- n = n * 2 + (cmp < 0);
- } while (n < t->size);
-
- inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
-
- /*
- * n would have been the node we recursed to - the low bit tells us if
- * we recursed left or recursed right.
- */
- if (likely(!(n & 1))) {
- --inorder;
- if (unlikely(!inorder))
- return btree_bkey_first(b, t);
-
- f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
- }
-
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
-}
-
-static __always_inline __flatten
-struct bkey_packed *__bch2_bset_search(struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
- const struct bkey_packed *lossy_packed_search)
-{
-
- /*
-	 * First we search for a cacheline, then we do a linear search within
-	 * that cacheline.
-	 *
-	 * To search for the cacheline, there are three different possibilities:

- * * The set is too small to have a search tree, so we just do a linear
- * search over the whole set.
- * * The set is the one we're currently inserting into; keeping a full
- * auxiliary search tree up to date would be too expensive, so we
- * use a much simpler lookup table to do a binary search -
- * bset_search_write_set().
- * * Or we use the auxiliary search tree we constructed earlier -
- * bset_search_tree()
- */
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- return btree_bkey_first(b, t);
- case BSET_RW_AUX_TREE:
- return bset_search_write_set(b, t, search);
- case BSET_RO_AUX_TREE:
- return bset_search_tree(b, t, search, lossy_packed_search);
- default:
- BUG();
- }
-}
-
-static __always_inline __flatten
-struct bkey_packed *bch2_bset_search_linear(struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
- struct bkey_packed *packed_search,
- const struct bkey_packed *lossy_packed_search,
- struct bkey_packed *m)
-{
- if (lossy_packed_search)
- while (m != btree_bkey_last(b, t) &&
- bkey_iter_cmp_p_or_unp(b, m,
- lossy_packed_search, search) < 0)
- m = bkey_p_next(m);
-
- if (!packed_search)
- while (m != btree_bkey_last(b, t) &&
- bkey_iter_pos_cmp(b, m, search) < 0)
- m = bkey_p_next(m);
-
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
- struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
-
- BUG_ON(prev &&
- bkey_iter_cmp_p_or_unp(b, prev,
- packed_search, search) >= 0);
- }
-
- return m;
-}
-
-/* Btree node iterator */
-
-static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
-{
- if (k != end) {
- struct btree_node_iter_set *pos;
-
- btree_node_iter_for_each(iter, pos)
- ;
-
- BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
- *pos = (struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k),
- __btree_node_key_to_offset(b, end)
- };
- }
-}
-
-void bch2_btree_node_iter_push(struct btree_node_iter *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
-{
- __bch2_btree_node_iter_push(iter, b, k, end);
- bch2_btree_node_iter_sort(iter, b);
-}
-
-noinline __flatten __cold
-static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
- struct btree *b, struct bpos *search)
-{
- struct bkey_packed *k;
-
- trace_bkey_pack_pos_fail(search);
-
- bch2_btree_node_iter_init_from_start(iter, b);
-
- while ((k = bch2_btree_node_iter_peek(iter, b)) &&
- bkey_iter_pos_cmp(b, k, search) < 0)
- bch2_btree_node_iter_advance(iter, b);
-}
-
-/**
- * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
- * given position
- *
- * @iter: iterator to initialize
- * @b: btree node to search
- * @search: search key
- *
- * Main entry point to the lookup code for individual btree nodes:
- *
- * NOTE:
- *
- * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
- * keys. This doesn't matter for most code, but it does matter for lookups.
- *
- * Some adjacent keys with a string of equal keys:
- * i j k k k k l m
- *
- * If you search for k, the lookup code isn't guaranteed to return you any
- * specific k. The lookup code is conceptually doing a binary search, and
- * iterating backwards is very expensive, so if the pivot happens to land at
- * the last k, that's what you'll get.
- *
- * This works out ok, but it's something to be aware of:
- *
- * - For non extents, we guarantee that the live key comes last - see
- * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
- * see will only be deleted keys you don't care about.
- *
- * - For extents, deleted keys sort last (see the comment at the top of this
- * file). But when you're searching for extents, you actually want the first
- * key strictly greater than your search key - an extent that compares equal
- * to the search key is going to have 0 sectors after the search key.
- *
- * But this does mean that we can't just search for
- * bpos_successor(start_of_range) to get the first extent that overlaps with
- * the range we want - if we're unlucky and there's an extent that ends
- * exactly where we searched, then there could be a deleted key at the same
- * position and we'd get that when we search instead of the preceding extent
- * we needed.
- *
- * So we've got to search for start_of_range, then after the lookup iterate
- * past any extents that compare equal to the position we searched for.
- */
-__flatten
-void bch2_btree_node_iter_init(struct btree_node_iter *iter,
- struct btree *b, struct bpos *search)
-{
- struct bkey_packed p, *packed_search = NULL;
- struct btree_node_iter_set *pos = iter->data;
- struct bkey_packed *k[MAX_BSETS];
- unsigned i;
-
- EBUG_ON(bpos_lt(*search, b->data->min_key));
- EBUG_ON(bpos_gt(*search, b->data->max_key));
- bset_aux_tree_verify(b);
-
- memset(iter, 0, sizeof(*iter));
-
- switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
- case BKEY_PACK_POS_EXACT:
- packed_search = &p;
- break;
- case BKEY_PACK_POS_SMALLER:
- packed_search = NULL;
- break;
- case BKEY_PACK_POS_FAIL:
- btree_node_iter_init_pack_failed(iter, b, search);
- return;
- }
-
- for (i = 0; i < b->nsets; i++) {
- k[i] = __bch2_bset_search(b, b->set + i, search, &p);
- prefetch_four_cachelines(k[i]);
- }
-
- for (i = 0; i < b->nsets; i++) {
- struct bset_tree *t = b->set + i;
- struct bkey_packed *end = btree_bkey_last(b, t);
-
- k[i] = bch2_bset_search_linear(b, t, search,
- packed_search, &p, k[i]);
- if (k[i] != end)
- *pos++ = (struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k[i]),
- __btree_node_key_to_offset(b, end)
- };
- }
-
- bch2_btree_node_iter_sort(iter, b);
-}
-
-void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
- struct btree *b)
-{
- memset(iter, 0, sizeof(*iter));
-
- for_each_bset(b, t)
- __bch2_btree_node_iter_push(iter, b,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- bch2_btree_node_iter_sort(iter, b);
-}
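
[Editorial aside, not part of the deleted file: the node iterator is essentially a small k-way merge - one cursor per bset, and whichever cursor currently points at the smallest key is consumed first. A simplified selection-based version over plain integer arrays (the real iterator instead keeps its cursors sorted so the next key is always in slot 0); names are hypothetical:]

#include <stdio.h>

struct cursor { const int *k, *end; };

/* Return the next smallest element across all cursors, or NULL when done */
static const int *merge_next(struct cursor *c, unsigned nr)
{
	unsigned best = nr;

	for (unsigned i = 0; i < nr; i++)
		if (c[i].k != c[i].end &&
		    (best == nr || *c[i].k < *c[best].k))
			best = i;

	return best != nr ? c[best].k++ : NULL;
}

int main(void)
{
	int a[] = { 1, 4, 7 }, b[] = { 2, 5 }, d[] = { 3, 6 };
	struct cursor c[] = {
		{ a, a + 3 }, { b, b + 2 }, { d, d + 2 },
	};
	const int *k;

	while ((k = merge_next(c, 3)))
		printf("%d ", *k);	/* prints 1 2 3 4 5 6 7 */
	printf("\n");
	return 0;
}
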
-
-struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
- struct btree *b,
- struct bset_tree *t)
-{
- struct btree_node_iter_set *set;
-
- btree_node_iter_for_each(iter, set)
- if (set->end == t->end_offset)
- return __btree_node_offset_to_key(b, set->k);
-
- return btree_bkey_last(b, t);
-}
-
-static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
- struct btree *b,
- unsigned first)
-{
- bool ret;
-
- if ((ret = (btree_node_iter_cmp(b,
- iter->data[first],
- iter->data[first + 1]) > 0)))
- swap(iter->data[first], iter->data[first + 1]);
- return ret;
-}
-
-void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
- struct btree *b)
-{
- /* unrolled bubble sort: */
-
- if (!__btree_node_iter_set_end(iter, 2)) {
- btree_node_iter_sort_two(iter, b, 0);
- btree_node_iter_sort_two(iter, b, 1);
- }
-
- if (!__btree_node_iter_set_end(iter, 1))
- btree_node_iter_sort_two(iter, b, 0);
-}
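
[Editorial aside, not part of the deleted file: sorting at most three cursors only needs the three compare-and-swaps above - (0,1), (1,2), (0,1). The same tiny sorting network on plain integers, as a quick standalone check with hypothetical names:]

#include <assert.h>

static void swap_if_gt(int *v, unsigned i)
{
	if (v[i] > v[i + 1]) {
		int tmp = v[i];
		v[i] = v[i + 1];
		v[i + 1] = tmp;
	}
}

static void sort3(int *v)
{
	swap_if_gt(v, 0);	/* larger of v[0], v[1] moves right */
	swap_if_gt(v, 1);	/* largest of all three ends up in v[2] */
	swap_if_gt(v, 0);	/* fix up the remaining pair */
}

int main(void)
{
	int v[] = { 3, 1, 2 };

	sort3(v);
	assert(v[0] == 1 && v[1] == 2 && v[2] == 3);
	return 0;
}
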
-
-void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
- struct btree_node_iter_set *set)
-{
- struct btree_node_iter_set *last =
- iter->data + ARRAY_SIZE(iter->data) - 1;
-
- memmove(&set[0], &set[1], (void *) last - (void *) set);
- *last = (struct btree_node_iter_set) { 0, 0 };
-}
-
-static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
- struct btree *b)
-{
- iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
-
- EBUG_ON(iter->data->k > iter->data->end);
-
- if (unlikely(__btree_node_iter_set_end(iter, 0))) {
- /* avoid an expensive memmove call: */
- iter->data[0] = iter->data[1];
- iter->data[1] = iter->data[2];
- iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
- return;
- }
-
- if (__btree_node_iter_set_end(iter, 1))
- return;
-
- if (!btree_node_iter_sort_two(iter, b, 0))
- return;
-
- if (__btree_node_iter_set_end(iter, 2))
- return;
-
- btree_node_iter_sort_two(iter, b, 1);
-}
-
-void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
- struct btree *b)
-{
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
- __bch2_btree_node_iter_verify(iter, b);
- __bch2_btree_node_iter_next_check(iter, b);
- }
-
- __bch2_btree_node_iter_advance(iter, b);
-}
-
-/*
- * Expensive:
- */
-struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct bkey_packed *k, *prev = NULL;
- struct btree_node_iter_set *set;
- unsigned end = 0;
-
- bch2_btree_node_iter_verify(iter, b);
-
- for_each_bset(b, t) {
- k = bch2_bkey_prev_all(b, t,
- bch2_btree_node_iter_bset_pos(iter, b, t));
- if (k &&
- (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
- prev = k;
- end = t->end_offset;
- }
- }
-
- if (!prev)
- return NULL;
-
- /*
- * We're manually memmoving instead of just calling sort() to ensure the
- * prev we picked ends up in slot 0 - sort won't necessarily put it
- * there because of duplicate deleted keys:
- */
- btree_node_iter_for_each(iter, set)
- if (set->end == end)
- goto found;
-
- BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
-found:
- BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
-
- memmove(&iter->data[1],
- &iter->data[0],
- (void *) set - (void *) &iter->data[0]);
-
- iter->data[0].k = __btree_node_key_to_offset(b, prev);
- iter->data[0].end = end;
-
- bch2_btree_node_iter_verify(iter, b);
- return prev;
-}
-
-struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct bkey_packed *prev;
-
- do {
- prev = bch2_btree_node_iter_prev_all(iter, b);
- } while (prev && bkey_deleted(prev));
-
- return prev;
-}
-
-struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
- struct btree *b,
- struct bkey *u)
-{
- struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
-
- return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
-}
-
-/* Mergesort */
-
-void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
-{
- for_each_bset_c(b, t) {
- enum bset_aux_tree_type type = bset_aux_tree_type(t);
- size_t j;
-
- stats->sets[type].nr++;
- stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
- sizeof(u64);
-
- if (bset_has_ro_aux_tree(t)) {
- stats->floats += t->size - 1;
-
- for (j = 1; j < t->size; j++)
- stats->failed +=
- bkey_float(b, t, j)->exponent ==
- BFLOAT_FAILED;
- }
- }
-}
-
-void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
- struct bkey_packed *k)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
- struct bkey uk;
- unsigned j, inorder;
-
- if (!bset_has_ro_aux_tree(t))
- return;
-
- inorder = bkey_to_cacheline(b, t, k);
- if (!inorder || inorder >= t->size)
- return;
-
- j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
- if (k != tree_to_bkey(b, t, j))
- return;
-
- switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED:
- uk = bkey_unpack_key(b, k);
- prt_printf(out,
- " failed unpacked at depth %u\n"
- "\t",
- ilog2(j));
- bch2_bpos_to_text(out, uk.p);
- prt_printf(out, "\n");
- break;
- }
-}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
deleted file mode 100644
index a15ecf9d006e..000000000000
--- a/fs/bcachefs/bset.h
+++ /dev/null
@@ -1,536 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BSET_H
-#define _BCACHEFS_BSET_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "btree_types.h"
-#include "util.h" /* for time_stats */
-#include "vstructs.h"
-
-/*
- * BKEYS:
- *
- * A bkey contains a key, a size field, a variable number of pointers, and some
- * ancillary flag bits.
- *
- * We use two different functions for validating bkeys, bkey_invalid and
- * bkey_deleted().
- *
- * The one exception to the rule that ptr_invalid() filters out invalid keys is
- * that it also filters out keys of size 0 - these are keys that have been
- * completely overwritten. It'd be safe to delete these in memory while leaving
- * them on disk, just unnecessary work - so we filter them out when resorting
- * instead.
- *
- * We can't filter out stale keys when we're resorting, because garbage
- * collection needs to find them to ensure bucket gens don't wrap around -
- * unless we're rewriting the btree node, those stale keys still exist on disk.
- *
- * We also implement functions here for removing some number of sectors from the
- * front or the back of a bkey - this is mainly used for fixing overlapping
- * extents, by removing the overlapping sectors from the older key.
- *
- * BSETS:
- *
- * A bset is an array of bkeys laid out contiguously in memory in sorted order,
- * along with a header. A btree node is made up of a number of these, written at
- * different times.
- *
- * There could be many of them on disk, but we never allow there to be more than
- * 4 in memory - we lazily resort as needed.
- *
- * We implement code here for creating and maintaining auxiliary search trees
- * (described below) for searching an individual bset, and on top of that we
- * implement a btree iterator.
- *
- * BTREE ITERATOR:
- *
- * Most of the code in bcache doesn't care about an individual bset - it needs
- * to search entire btree nodes and iterate over them in sorted order.
- *
- * The btree iterator code serves both functions; it iterates through the keys
- * in a btree node in sorted order, starting from either keys after a specific
- * point (if you pass it a search key) or the start of the btree node.
- *
- * AUXILIARY SEARCH TREES:
- *
- * Since keys are variable length, we can't use a binary search on a bset - we
- * wouldn't be able to find the start of the next key. But binary searches are
- * slow anyways, due to terrible cache behaviour; bcache originally used binary
- * searches and that code topped out at under 50k lookups/second.
- *
- * So we need to construct some sort of lookup table. Since we only insert keys
- * into the last (unwritten) set, most of the keys within a given btree node are
- * usually in sets that are mostly constant. We use two different types of
- * lookup tables to take advantage of this.
- *
- * Both lookup tables share in common that they don't index every key in the
- * set; they index one key every BSET_CACHELINE bytes, and then a linear search
- * is used for the rest.
- *
- * For sets that have been written to disk and are no longer being inserted
- * into, we construct a binary search tree in an array - traversing a binary
- * search tree in an array gives excellent locality of reference and is very
- * fast, since both children of any node are adjacent to each other in memory
- * (and their grandchildren, and great grandchildren...) - this means
- * prefetching can be used to great effect.
- *
- * It's quite useful performance wise to keep these nodes small - not just
- * because they're more likely to be in L2, but also because we can prefetch
- * more nodes on a single cacheline and thus prefetch more iterations in advance
- * when traversing this tree.
- *
- * Nodes in the auxiliary search tree must contain both a key to compare against
- * (we don't want to fetch the key from the set, that would defeat the purpose),
- * and a pointer to the key. We use a few tricks to compress both of these.
- *
- * To compress the pointer, we take advantage of the fact that one node in the
- * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
- * a function (to_inorder()) that takes the index of a node in a binary tree and
- * returns what its index would be in an inorder traversal, so we only have to
- * store the low bits of the offset.
- *
- * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
- * compress that, we take advantage of the fact that when we're traversing the
- * search tree at every iteration we know that both our search key and the key
- * we're looking for lie within some range - bounded by our previous
- * comparisons. (We special case the start of a search so that this is true even
- * at the root of the tree).
- *
- * So if we know the key we're looking for is between a and b, and a and b
- * don't differ above bit 50, we don't need to check anything above bit 50.
- *
- * We don't usually need the rest of the bits, either; we only need enough bits
- * to partition the key range we're currently checking. Consider key n - the
- * key our auxiliary search tree node corresponds to, and key p, the key
- * immediately preceding n. The lowest bit we need to store in the auxiliary
- * search tree is the highest bit that differs between n and p.
- *
- * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
- * comparison. But we'd really like our nodes in the auxiliary search tree to be
- * of fixed size.
- *
- * The solution is to make them fixed size, and when we're constructing a node,
- * check if p and n differed in the bits we needed them to. If they didn't, we
- * flag that node, and when doing lookups we fall back to comparing against the
- * real key. As long as this doesn't happen too often (and it seems to reliably
- * happen a bit less than 1% of the time), we win - even on failures, that key
- * is then more likely to be in cache than if we were doing binary searches all
- * the way, since we're touching so much less memory.
- *
- * The keys in the auxiliary search tree are stored in (software) floating
- * point, with an exponent and a mantissa. The exponent needs to be big enough
- * to address all the bits in the original key, but the number of bits in the
- * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
- *
- * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
- * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
- * We need one node per 128 bytes in the btree node, which means the auxiliary
- * search trees take up 3% as much memory as the btree itself.
- *
- * Constructing these auxiliary search trees is moderately expensive, and we
- * don't want to be constantly rebuilding the search tree for the last set
- * whenever we insert another key into it. For the unwritten set, we use a much
- * simpler lookup table - it's just a flat array, so index i in the lookup table
- * corresponds to the i-th range of BSET_CACHELINE bytes in the set. Indexing
- * within each byte range works the same as with the auxiliary search trees.
- *
- * These are much easier to keep up to date when we insert a key - we do it
- * somewhat lazily; when we shift a key up we usually just increment the pointer
- * to it, and only when it would overflow do we go to the trouble of finding the
- * first key in that range of bytes again.
- */
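
[Editorial aside, not part of the deleted file: a reference sketch of the array-layout binary search tree described above, reduced to plain integers. The tree is stored in breadth-first (Eytzinger) order, 1-indexed, so node n's children are 2n and 2n+1 and the descent is branch-light and prefetch-friendly. This is a generic illustration under assumed names, not the eytzinger1 helpers the real code uses - those also handle non-power-of-two sizes and carry the bkey_float compression on top:]

#include <assert.h>

/* Fill tree[1..n] from sorted[0..n-1] by in-order traversal */
static unsigned long eytzinger_build(const int *sorted, int *tree,
				     unsigned long i, unsigned long k,
				     unsigned long n)
{
	if (k <= n) {
		i = eytzinger_build(sorted, tree, i, 2 * k, n);
		tree[k] = sorted[i++];
		i = eytzinger_build(sorted, tree, i, 2 * k + 1, n);
	}
	return i;
}

/* Return the 1-based index of the first element >= x, or 0 if there is none */
static unsigned long eytzinger_lower_bound(const int *tree, unsigned long n, int x)
{
	unsigned long k = 1;

	while (k <= n)
		k = 2 * k + (tree[k] < x);	/* go right if this node is too small */

	return k >> (__builtin_ctzl(~k) + 1);	/* undo the trailing right turns */
}

int main(void)
{
	int sorted[] = { 1, 3, 5, 7 }, tree[5];

	eytzinger_build(sorted, tree, 0, 1, 4);

	assert(tree[eytzinger_lower_bound(tree, 4, 4)] == 5);
	assert(tree[eytzinger_lower_bound(tree, 4, 7)] == 7);
	assert(eytzinger_lower_bound(tree, 4, 8) == 0);
	return 0;
}
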
-
-enum bset_aux_tree_type {
- BSET_NO_AUX_TREE,
- BSET_RO_AUX_TREE,
- BSET_RW_AUX_TREE,
-};
-
-#define BSET_TREE_NR_TYPES 3
-
-#define BSET_NO_AUX_TREE_VAL (U16_MAX)
-#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
-
-static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
-{
- switch (t->extra) {
- case BSET_NO_AUX_TREE_VAL:
- EBUG_ON(t->size);
- return BSET_NO_AUX_TREE;
- case BSET_RW_AUX_TREE_VAL:
- EBUG_ON(!t->size);
- return BSET_RW_AUX_TREE;
- default:
- EBUG_ON(!t->size);
- return BSET_RO_AUX_TREE;
- }
-}
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It defines the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliary search tree - when we're done searching the bkey_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE 256
-
-static inline size_t btree_keys_cachelines(const struct btree *b)
-{
- return (1U << b->byte_order) / BSET_CACHELINE;
-}
-
-static inline size_t btree_aux_data_bytes(const struct btree *b)
-{
- return btree_keys_cachelines(b) * 8;
-}
-
-static inline size_t btree_aux_data_u64s(const struct btree *b)
-{
- return btree_aux_data_bytes(b) / sizeof(u64);
-}
-
-#define for_each_bset(_b, _t) \
- for (struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-#define for_each_bset_c(_b, _t) \
- for (const struct bset_tree *_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-#define bset_tree_for_each_key(_b, _t, _k) \
- for (_k = btree_bkey_first(_b, _t); \
- _k != btree_bkey_last(_b, _t); \
- _k = bkey_p_next(_k))
-
-static inline bool bset_has_ro_aux_tree(const struct bset_tree *t)
-{
- return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
-}
-
-static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
-{
- return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
-}
-
-static inline void bch2_bset_set_no_aux_tree(struct btree *b,
- struct bset_tree *t)
-{
- BUG_ON(t < b->set);
-
- for (; t < b->set + ARRAY_SIZE(b->set); t++) {
- t->size = 0;
- t->extra = BSET_NO_AUX_TREE_VAL;
- t->aux_data_offset = U16_MAX;
- }
-}
-
-static inline void btree_node_set_format(struct btree *b,
- struct bkey_format f)
-{
- int len;
-
- b->format = f;
- b->nr_key_bits = bkey_format_key_bits(&f);
-
- len = bch2_compile_bkey_format(&b->format, b->aux_data);
- BUG_ON(len < 0 || len > U8_MAX);
-
- b->unpack_fn_len = len;
-
- bch2_bset_set_no_aux_tree(b, b->set);
-}
-
-static inline struct bset *bset_next_set(struct btree *b,
- unsigned block_bytes)
-{
- struct bset *i = btree_bset_last(b);
-
- EBUG_ON(!is_power_of_2(block_bytes));
-
- return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
-}
-
-void bch2_btree_keys_init(struct btree *);
-
-void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
-void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-
-void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *,
- unsigned);
-void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
-
-/* Bkey utility code */
-
-/* packed or unpacked */
-static inline int bkey_cmp_p_or_unp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r_packed,
- const struct bpos *r)
-{
- EBUG_ON(r_packed && !bkey_packed(r_packed));
-
- if (unlikely(!bkey_packed(l)))
- return bpos_cmp(packed_to_bkey_c(l)->p, *r);
-
- if (likely(r_packed))
- return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
-
- return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-static inline struct bset_tree *
-bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
-{
- unsigned offset = __btree_node_key_to_offset(b, k);
-
- for_each_bset(b, t)
- if (offset <= t->end_offset) {
- EBUG_ON(offset < btree_bkey_first_offset(t));
- return t;
- }
-
- BUG();
-}
-
-struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
-
-struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
- struct bkey_packed *, unsigned);
-
-static inline struct bkey_packed *
-bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
-{
- return bch2_bkey_prev_filter(b, t, k, 0);
-}
-
-static inline struct bkey_packed *
-bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
-{
- return bch2_bkey_prev_filter(b, t, k, 1);
-}
-
-/* Btree key iteration */
-
-void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
- struct bpos *);
-void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
- struct btree *);
-struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
- struct btree *,
- struct bset_tree *);
-
-void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
-void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
- struct btree_node_iter_set *);
-void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
-
-#define btree_node_iter_for_each(_iter, _set) \
- for (_set = (_iter)->data; \
- _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \
- (_set)->k != (_set)->end; \
- _set++)
-
-static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
- unsigned i)
-{
- return iter->data[i].k == iter->data[i].end;
-}
-
-static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
-{
- return __btree_node_iter_set_end(iter, 0);
-}
-
-/*
- * When keys compare equal, deleted keys compare first:
- *
- * XXX: only need to compare pointers for keys that are both within a
- * btree_node_iterator - we need to break ties for prev() to work correctly
- */
-static inline int bkey_iter_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed(b, l, r)
- ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
- ?: cmp_int(l, r);
-}
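
[Editorial aside, not part of the deleted file: bkey_iter_cmp() chains its comparisons with the GNU `?:` extension so that each later term only breaks ties - key position first, then "deleted sorts first", then pointer order for a total ordering. The same idiom on plain data, as a sketch with a hypothetical struct:]

#include <assert.h>

struct elem { int key; int deleted; };

#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

static int elem_cmp(const struct elem *l, const struct elem *r)
{
	/* later terms only apply when every earlier term compared equal */
	return cmp_int(l->key, r->key) ?:
	       (r->deleted - l->deleted) ?:
	       cmp_int((unsigned long) l, (unsigned long) r);
}

int main(void)
{
	struct elem a = { 1, 1 }, b = { 1, 0 };

	assert(elem_cmp(&a, &b) < 0);	/* equal keys: the deleted one sorts first */
	return 0;
}
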
-
-static inline int btree_node_iter_cmp(const struct btree *b,
- struct btree_node_iter_set l,
- struct btree_node_iter_set r)
-{
- return bkey_iter_cmp(b,
- __btree_node_offset_to_key(b, l.k),
- __btree_node_offset_to_key(b, r.k));
-}
-
-/* These assume r (the search key) is not a deleted key: */
-static inline int bkey_iter_pos_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bpos *r)
-{
- return bkey_cmp_left_packed(b, l, r)
- ?: -((int) bkey_deleted(l));
-}
-
-static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r_packed,
- const struct bpos *r)
-{
- return bkey_cmp_p_or_unp(b, l, r_packed, r)
- ?: -((int) bkey_deleted(l));
-}
-
-static inline struct bkey_packed *
-__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
- struct btree *b)
-{
- return __btree_node_offset_to_key(b, iter->data->k);
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
-{
- return !bch2_btree_node_iter_end(iter)
- ? __btree_node_offset_to_key(b, iter->data->k)
- : NULL;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
-{
- struct bkey_packed *k;
-
- while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
- bkey_deleted(k))
- bch2_btree_node_iter_advance(iter, b);
-
- return k;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
-{
- struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
-
- if (ret)
- bch2_btree_node_iter_advance(iter, b);
-
- return ret;
-}
-
-struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
- struct btree *);
-struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
- struct btree *);
-
-struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
- struct btree *,
- struct bkey *);
-
-#define for_each_btree_node_key(b, k, iter) \
- for (bch2_btree_node_iter_init_from_start((iter), (b)); \
- (k = bch2_btree_node_iter_peek((iter), (b))); \
- bch2_btree_node_iter_advance(iter, b))
-
-#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \
- for (bch2_btree_node_iter_init_from_start((iter), (b)); \
- (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
- bch2_btree_node_iter_advance(iter, b))
-
-/* Accounting: */
-
-struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
-
-static inline void btree_keys_account_key(struct btree_nr_keys *n,
- unsigned bset,
- struct bkey_packed *k,
- int sign)
-{
- n->live_u64s += k->u64s * sign;
- n->bset_u64s[bset] += k->u64s * sign;
-
- if (bkey_packed(k))
- n->packed_keys += sign;
- else
- n->unpacked_keys += sign;
-}
-
-static inline void btree_keys_account_val_delta(struct btree *b,
- struct bkey_packed *k,
- int delta)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
-
- b->nr.live_u64s += delta;
- b->nr.bset_u64s[t - b->set] += delta;
-}
-
-#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
- btree_keys_account_key(_nr, _bset_idx, _k, 1)
-#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
- btree_keys_account_key(_nr, _bset_idx, _k, -1)
-
-#define btree_account_key_add(_b, _k) \
- btree_keys_account_key(&(_b)->nr, \
- bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
-#define btree_account_key_drop(_b, _k) \
- btree_keys_account_key(&(_b)->nr, \
- bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
-
-struct bset_stats {
- struct {
- size_t nr, bytes;
- } sets[BSET_TREE_NR_TYPES];
-
- size_t floats;
- size_t failed;
-};
-
-void bch2_btree_keys_stats(const struct btree *, struct bset_stats *);
-void bch2_bfloat_to_text(struct printbuf *, struct btree *,
- struct bkey_packed *);
-
-/* Debug stuff */
-
-void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
-void bch2_dump_btree_node(struct bch_fs *, struct btree *);
-void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
-
-void __bch2_verify_btree_nr_keys(struct btree *);
-void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-
-static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b)
-{
- if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
- __bch2_btree_node_iter_verify(iter, b);
-}
-
-static inline void bch2_verify_btree_nr_keys(struct btree *b)
-{
- if (static_branch_unlikely(&bch2_debug_check_btree_accounting))
- __bch2_verify_btree_nr_keys(b);
-}
-
-#endif /* _BCACHEFS_BSET_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
deleted file mode 100644
index 83c9860e6b82..000000000000
--- a/fs/bcachefs/btree_cache.c
+++ /dev/null
@@ -1,1516 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "trace.h"
-
-#include <linux/prefetch.h>
-#include <linux/sched/mm.h>
-#include <linux/swap.h>
-
-const char * const bch2_btree_node_flags[] = {
- "typebit",
- "typebit",
- "typebit",
-#define x(f) [BTREE_NODE_##f] = #f,
- BTREE_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_recalc_btree_reserve(struct bch_fs *c)
-{
- unsigned reserve = 16;
-
- if (!c->btree_roots_known[0].b)
- reserve += 8;
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (r->b)
- reserve += min_t(unsigned, 1, r->b->c.level) * 8;
- }
-
- c->btree_cache.nr_reserve = reserve;
-}
-
-static inline size_t btree_cache_can_free(struct btree_cache_list *list)
-{
- struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
-
- size_t can_free = list->nr;
- if (!list->idx)
- can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
- return can_free;
-}
-
-static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
-
- if (b->c.lock.readers)
- list_add(&b->list, &bc->freed_pcpu);
- else
- list_add(&b->list, &bc->freed_nonpcpu);
-}
-
-static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
- BUG_ON(!b->data);
-
- bc->nr_freeable++;
- list_add(&b->list, &bc->freeable);
-}
-
-void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
-{
- struct btree_cache *bc = &c->btree_cache;
-
- mutex_lock(&bc->lock);
- __bch2_btree_node_to_freelist(bc, b);
- mutex_unlock(&bc->lock);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-}
-
-void __btree_node_data_free(struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
- BUG_ON(btree_node_hashed(b));
-
- /*
- * This should really be done in slub/vmalloc, but we're using the
- * kmalloc_large() path, so we're working around a slub bug by doing
- * this here:
- */
- if (b->data)
- mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
- if (b->aux_data)
- mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
-
- EBUG_ON(btree_node_write_in_flight(b));
-
- clear_btree_node_just_written(b);
-
- kvfree(b->data);
- b->data = NULL;
-#ifdef __KERNEL__
- kvfree(b->aux_data);
-#else
- munmap(b->aux_data, btree_aux_data_bytes(b));
-#endif
- b->aux_data = NULL;
-}
-
-static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(list_empty(&b->list));
- list_del_init(&b->list);
-
- __btree_node_data_free(b);
-
- --bc->nr_freeable;
- btree_node_to_freedlist(bc, b);
-}
-
-static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct btree *b = obj;
- const u64 *v = arg->key;
-
- return b->hash_val == *v ? 0 : 1;
-}
-
-static const struct rhashtable_params bch_btree_cache_params = {
- .head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, hash_val),
- .key_len = sizeof(u64),
- .obj_cmpfn = bch2_btree_cache_cmp_fn,
- .automatic_shrinking = true,
-};
-
-static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
-{
- BUG_ON(b->data || b->aux_data);
-
- gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
-
- b->data = kvmalloc(btree_buf_bytes(b), gfp);
- if (!b->data)
- return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
-#ifdef __KERNEL__
- b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
-#else
- b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
- PROT_READ|PROT_WRITE|PROT_EXEC,
- MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
- if (b->aux_data == MAP_FAILED)
- b->aux_data = NULL;
-#endif
- if (!b->aux_data) {
- kvfree(b->data);
- b->data = NULL;
- return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
- }
-
- return 0;
-}
-
-static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
-{
- struct btree *b;
-
- b = kzalloc(sizeof(struct btree), gfp);
- if (!b)
- return NULL;
-
- bkey_btree_ptr_init(&b->key);
- INIT_LIST_HEAD(&b->list);
- INIT_LIST_HEAD(&b->write_blocked);
- b->byte_order = ilog2(c->opts.btree_node_size);
- return b;
-}
-
-struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
-{
- struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
- if (!b)
- return NULL;
-
- if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
- kfree(b);
- return NULL;
- }
-
- bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
- return b;
-}
-
-static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
-{
- struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
- u64 mask = bc->pinned_nodes_mask[!!b->c.level];
-
- return ((mask & BIT_ULL(b->c.btree_id)) &&
- bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
- bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
-}
-
-void bch2_node_pin(struct bch_fs *c, struct btree *b)
-{
- struct btree_cache *bc = &c->btree_cache;
-
- mutex_lock(&bc->lock);
- if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
- set_btree_node_pinned(b);
- list_move(&b->list, &bc->live[1].list);
- bc->live[0].nr--;
- bc->live[1].nr++;
- }
- mutex_unlock(&bc->lock);
-}
-
-void bch2_btree_cache_unpin(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b, *n;
-
- mutex_lock(&bc->lock);
- c->btree_cache.pinned_nodes_mask[0] = 0;
- c->btree_cache.pinned_nodes_mask[1] = 0;
-
- list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
- clear_btree_node_pinned(b);
- list_move(&b->list, &bc->live[0].list);
- bc->live[0].nr++;
- bc->live[1].nr--;
- }
-
- mutex_unlock(&bc->lock);
-}
-
-/* Btree in memory cache - hash table */
-
-void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
-{
- lockdep_assert_held(&bc->lock);
-
- int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
- BUG_ON(ret);
-
- /* Cause future lookups for this node to fail: */
- b->hash_val = 0;
-
- if (b->c.btree_id < BTREE_ID_NR)
- --bc->nr_by_btree[b->c.btree_id];
- --bc->live[btree_node_pinned(b)].nr;
- list_del_init(&b->list);
-}
-
-void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
-{
- __bch2_btree_node_hash_remove(bc, b);
- __bch2_btree_node_to_freelist(bc, b);
-}
-
-int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
-{
- BUG_ON(!list_empty(&b->list));
- BUG_ON(b->hash_val);
-
- b->hash_val = btree_ptr_hash_val(&b->key);
- int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
- bch_btree_cache_params);
- if (ret)
- return ret;
-
- if (b->c.btree_id < BTREE_ID_NR)
- bc->nr_by_btree[b->c.btree_id]++;
-
- bool p = __btree_node_pinned(bc, b);
- mod_bit(BTREE_NODE_pinned, &b->flags, p);
-
- list_add_tail(&b->list, &bc->live[p].list);
- bc->live[p].nr++;
- return 0;
-}
-
-int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
- unsigned level, enum btree_id id)
-{
- b->c.level = level;
- b->c.btree_id = id;
-
- mutex_lock(&bc->lock);
- int ret = __bch2_btree_node_hash_insert(bc, b);
- mutex_unlock(&bc->lock);
-
- return ret;
-}
-
-void bch2_btree_node_update_key_early(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_i *new)
-{
- struct bch_fs *c = trans->c;
- struct btree *b;
- struct bkey_buf tmp;
- int ret;
-
- bch2_bkey_buf_init(&tmp);
- bch2_bkey_buf_reassemble(&tmp, c, old);
-
- b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
- if (!IS_ERR_OR_NULL(b)) {
- mutex_lock(&c->btree_cache.lock);
-
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, new);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
-
- mutex_unlock(&c->btree_cache.lock);
- six_unlock_read(&b->c.lock);
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-__flatten
-static inline struct btree *btree_cache_find(struct btree_cache *bc,
- const struct bkey_i *k)
-{
- u64 v = btree_ptr_hash_val(k);
-
- return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
-}
-
-static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
- bool flush, bool locked)
-{
- struct btree_cache *bc = &c->btree_cache;
-
- lockdep_assert_held(&bc->lock);
-
- if (btree_node_noevict(b)) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
- if (btree_node_write_blocked(b)) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
- if (btree_node_will_make_reachable(b)) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
-
- if (btree_node_dirty(b)) {
- if (!flush) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
-
- if (locked) {
- /*
- * Using the underscore version because we don't want to compact
- * bsets after the write, since this node is about to be evicted
- * - unless btree verify mode is enabled, since it runs out of
- * the post write cleanup:
- */
- if (static_branch_unlikely(&bch2_verify_btree_ondisk))
- bch2_btree_node_write(c, b, SIX_LOCK_intent,
- BTREE_WRITE_cache_reclaim);
- else
- __bch2_btree_node_write(c, b,
- BTREE_WRITE_cache_reclaim);
- }
- }
-
- if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
- (1U << BTREE_NODE_write_in_flight))) {
- if (!flush) {
- if (btree_node_read_in_flight(b))
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++;
- else if (btree_node_write_in_flight(b))
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
-
- if (locked)
- return -EINTR;
-
- /* XXX: waiting on IO with btree cache lock held */
- bch2_btree_node_wait_on_read(b);
- bch2_btree_node_wait_on_write(b);
- }
-
- return 0;
-}
-
-/*
- * this version is for btree nodes that have already been freed (we're not
- * reaping a real btree node)
- */
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
-{
- struct btree_cache *bc = &c->btree_cache;
- int ret = 0;
-
- lockdep_assert_held(&bc->lock);
-retry_unlocked:
- ret = __btree_node_reclaim_checks(c, b, flush, false);
- if (ret)
- return ret;
-
- if (!six_trylock_intent(&b->c.lock)) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++;
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
-
- if (!six_trylock_write(&b->c.lock)) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
- six_unlock_intent(&b->c.lock);
- return bch_err_throw(c, ENOMEM_btree_node_reclaim);
- }
-
- /* recheck under lock */
- ret = __btree_node_reclaim_checks(c, b, flush, true);
- if (ret) {
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- if (ret == -EINTR)
- goto retry_unlocked;
- return ret;
- }
-
- if (b->hash_val && !ret)
- trace_and_count(c, btree_cache_reap, c, b);
- return 0;
-}
-
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
-{
- return __btree_node_reclaim(c, b, false);
-}
-
-static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
-{
- return __btree_node_reclaim(c, b, true);
-}
-
-static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct btree_cache_list *list = shrink->private_data;
- struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
- struct btree *b, *t;
- unsigned long nr = sc->nr_to_scan;
- unsigned long can_free = 0;
- unsigned long freed = 0;
- unsigned long touched = 0;
- unsigned i, flags;
- unsigned long ret = SHRINK_STOP;
- bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
-
- if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
- return SHRINK_STOP;
-
- mutex_lock(&bc->lock);
- flags = memalloc_nofs_save();
-
- /*
- * It's _really_ critical that we don't free too many btree nodes - we
- * have to always leave ourselves a reserve. The reserve is how we
- * guarantee that allocating memory for a new btree node can always
- * succeed, so that inserting keys into the btree can always succeed and
- * IO can always make forward progress:
- */
- can_free = btree_cache_can_free(list);
- if (nr > can_free) {
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free;
- nr = can_free;
- }
-
- i = 0;
- list_for_each_entry_safe(b, t, &bc->freeable, list) {
- /*
- * Leave a few nodes on the freeable list, so that a btree split
- * won't have to hit the system allocator:
- */
- if (++i <= 3)
- continue;
-
- touched++;
-
- if (touched >= nr)
- goto out;
-
- if (!btree_node_reclaim(c, b)) {
- btree_node_data_free(bc, b);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- freed++;
- bc->nr_freed++;
- }
- }
-restart:
- list_for_each_entry_safe(b, t, &list->list, list) {
- touched++;
-
- if (btree_node_accessed(b)) {
- clear_btree_node_accessed(b);
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
-			--touched;
- } else if (!btree_node_reclaim(c, b)) {
- __bch2_btree_node_hash_remove(bc, b);
- __btree_node_data_free(b);
- btree_node_to_freedlist(bc, b);
-
- freed++;
- bc->nr_freed++;
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- if (freed == nr)
- goto out_rotate;
- } else if (trigger_writes &&
- btree_node_dirty(b) &&
- !btree_node_will_make_reachable(b) &&
- !btree_node_write_blocked(b) &&
- six_trylock_read(&b->c.lock)) {
- list_move(&list->list, &b->list);
- mutex_unlock(&bc->lock);
- __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
- six_unlock_read(&b->c.lock);
- if (touched >= nr)
- goto out_nounlock;
- mutex_lock(&bc->lock);
- goto restart;
- }
-
- if (touched >= nr)
- break;
- }
-out_rotate:
- if (&t->list != &list->list)
- list_move_tail(&list->list, &t->list);
-out:
- mutex_unlock(&bc->lock);
-out_nounlock:
- ret = freed;
- memalloc_nofs_restore(flags);
- trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
- return ret;
-}
-
-static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct btree_cache_list *list = shrink->private_data;
-
- if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
- return 0;
-
- return btree_cache_can_free(list);
-}
-
-void bch2_fs_btree_cache_exit(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b, *t;
- unsigned long flags;
-
- shrinker_free(bc->live[1].shrink);
- shrinker_free(bc->live[0].shrink);
-
- /* vfree() can allocate memory: */
- flags = memalloc_nofs_save();
- mutex_lock(&bc->lock);
-
- if (c->verify_data)
- list_move(&c->verify_data->list, &bc->live[0].list);
-
- kvfree(c->verify_ondisk);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (r->b)
- list_add(&r->b->list, &bc->live[0].list);
- }
-
- list_for_each_entry_safe(b, t, &bc->live[1].list, list)
- bch2_btree_node_hash_remove(bc, b);
- list_for_each_entry_safe(b, t, &bc->live[0].list, list)
- bch2_btree_node_hash_remove(bc, b);
-
- list_for_each_entry_safe(b, t, &bc->freeable, list) {
- BUG_ON(btree_node_read_in_flight(b) ||
- btree_node_write_in_flight(b));
-
- btree_node_data_free(bc, b);
- cond_resched();
- }
-
- BUG_ON(!bch2_journal_error(&c->journal) &&
- atomic_long_read(&c->btree_cache.nr_dirty));
-
- list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
-
- list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
- list_del(&b->list);
- six_lock_exit(&b->c.lock);
- kfree(b);
- }
-
- mutex_unlock(&bc->lock);
- memalloc_nofs_restore(flags);
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
- BUG_ON(bc->nr_by_btree[i]);
- BUG_ON(bc->live[0].nr);
- BUG_ON(bc->live[1].nr);
- BUG_ON(bc->nr_freeable);
-
- if (bc->table_init_done)
- rhashtable_destroy(&bc->table);
-}
-
-int bch2_fs_btree_cache_init(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct shrinker *shrink;
- unsigned i;
- int ret = 0;
-
- ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
- if (ret)
- goto err;
-
- bc->table_init_done = true;
-
- bch2_recalc_btree_reserve(c);
-
- for (i = 0; i < bc->nr_reserve; i++) {
- struct btree *b = __bch2_btree_node_mem_alloc(c);
- if (!b)
- goto err;
- __bch2_btree_node_to_freelist(bc, b);
- }
-
- list_splice_init(&bc->live[0].list, &bc->freeable);
-
- mutex_init(&c->verify_lock);
-
- shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
- if (!shrink)
- goto err;
- bc->live[0].shrink = shrink;
- shrink->count_objects = bch2_btree_cache_count;
- shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 2;
- shrink->private_data = &bc->live[0];
- shrinker_register(shrink);
-
- shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
- if (!shrink)
- goto err;
- bc->live[1].shrink = shrink;
- shrink->count_objects = bch2_btree_cache_count;
- shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 8;
- shrink->private_data = &bc->live[1];
- shrinker_register(shrink);
-
- return 0;
-err:
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-}
-
-void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
-{
- mutex_init(&bc->lock);
- for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
- bc->live[i].idx = i;
- INIT_LIST_HEAD(&bc->live[i].list);
- }
- INIT_LIST_HEAD(&bc->freeable);
- INIT_LIST_HEAD(&bc->freed_pcpu);
- INIT_LIST_HEAD(&bc->freed_nonpcpu);
-}
-
-/*
- * We can only have one thread cannibalizing other cached btree nodes at a time,
- * or we'll deadlock. We use an open coded mutex to ensure that, which
- * cannibalize_bucket() will take. This means every time we unlock the root of
- * the btree, we need to release this lock if we have it held.
- */
-void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
-
- if (bc->alloc_lock == current) {
- trace_and_count(c, btree_cache_cannibalize_unlock, trans);
- bc->alloc_lock = NULL;
- closure_wake_up(&bc->alloc_wait);
- }
-}
-
-int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct task_struct *old;
-
- old = NULL;
- if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
- goto success;
-
- if (!cl) {
- trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
- }
-
- closure_wait(&bc->alloc_wait, cl);
-
- /* Try again, after adding ourselves to waitlist */
- old = NULL;
- if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
- /* We raced */
- closure_wake_up(&bc->alloc_wait);
- goto success;
- }
-
- trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return bch_err_throw(c, btree_cache_cannibalize_lock_blocked);
-
-success:
- trace_and_count(c, btree_cache_cannibalize_lock, trans);
- return 0;
-}
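The comment before bch2_btree_cache_cannibalize_unlock() describes an open coded mutex: one owner slot claimed with compare-and-swap, retried once after joining the waitlist so a concurrent wake-up cannot be lost. A minimal userspace C11 sketch of that pattern follows; it is illustrative only (no closures, no tracing) and is not part of this diff.

/* sketch: single-owner "cannibalize" lock, claimed by compare-and-swap */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic(void *) alloc_lock;		/* NULL means unlocked */

static bool cannibalize_trylock(void *me)
{
	void *old = NULL;

	/* claim the slot, or notice we already hold it */
	return atomic_compare_exchange_strong(&alloc_lock, &old, me) ||
	       old == me;
}

static void cannibalize_unlock(void *me)
{
	void *old = me;

	/* only the owner releases; this is where waiters would be woken */
	atomic_compare_exchange_strong(&alloc_lock, &old, NULL);
}

int main(void)
{
	int a, b;

	printf("A takes lock: %d\n", cannibalize_trylock(&a));	/* 1 */
	printf("B takes lock: %d\n", cannibalize_trylock(&b));	/* 0: would wait */
	cannibalize_unlock(&a);
	printf("B takes lock: %d\n", cannibalize_trylock(&b));	/* 1 */
	return 0;
}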
-
-static struct btree *btree_node_cannibalize(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
- list_for_each_entry_reverse(b, &bc->live[i].list, list)
- if (!btree_node_reclaim(c, b))
- return b;
-
- while (1) {
- for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
- list_for_each_entry_reverse(b, &bc->live[i].list, list)
- if (!btree_node_write_and_reclaim(c, b))
- return b;
-
- /*
- * Rare case: all nodes were intent-locked.
- * Just busy-wait.
- */
- WARN_ONCE(1, "btree cache cannibalize failed\n");
- cond_resched();
- }
-}
-
-struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct list_head *freed = pcpu_read_locks
- ? &bc->freed_pcpu
- : &bc->freed_nonpcpu;
- struct btree *b, *b2;
- u64 start_time = local_clock();
-
- mutex_lock(&bc->lock);
-
- /*
- * We never free struct btree itself, just the memory that holds the on
- * disk node. Check the freed list before allocating a new one:
- */
- list_for_each_entry(b, freed, list)
- if (!btree_node_reclaim(c, b)) {
- list_del_init(&b->list);
- goto got_node;
- }
-
- b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
- if (b) {
- bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
- } else {
- mutex_unlock(&bc->lock);
- bch2_trans_unlock(trans);
- b = __btree_node_mem_alloc(c, GFP_KERNEL);
- if (!b)
- goto err;
- bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
- mutex_lock(&bc->lock);
- }
-
- BUG_ON(!six_trylock_intent(&b->c.lock));
- BUG_ON(!six_trylock_write(&b->c.lock));
-
-got_node:
- /*
- * btree_free() doesn't free memory; it sticks the node on the end of
-	 * the list. Check if there are any freed nodes there:
- */
- list_for_each_entry(b2, &bc->freeable, list)
- if (!btree_node_reclaim(c, b2)) {
- swap(b->data, b2->data);
- swap(b->aux_data, b2->aux_data);
-
- list_del_init(&b2->list);
- --bc->nr_freeable;
- btree_node_to_freedlist(bc, b2);
- mutex_unlock(&bc->lock);
-
- six_unlock_write(&b2->c.lock);
- six_unlock_intent(&b2->c.lock);
- goto got_mem;
- }
-
- mutex_unlock(&bc->lock);
-
- if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
- bch2_trans_unlock(trans);
- if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
- goto err;
- }
-
-got_mem:
- BUG_ON(!list_empty(&b->list));
- BUG_ON(btree_node_hashed(b));
- BUG_ON(btree_node_dirty(b));
- BUG_ON(btree_node_write_in_flight(b));
-out:
- b->flags = 0;
- b->written = 0;
- b->nsets = 0;
- b->sib_u64s[0] = 0;
- b->sib_u64s[1] = 0;
- b->whiteout_u64s = 0;
- bch2_btree_keys_init(b);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
- start_time);
-
- int ret = bch2_trans_relock(trans);
- if (unlikely(ret)) {
- bch2_btree_node_to_freelist(c, b);
- return ERR_PTR(ret);
- }
-
- return b;
-err:
- mutex_lock(&bc->lock);
-
- /* Try to cannibalize another cached btree node: */
- if (bc->alloc_lock == current) {
- b2 = btree_node_cannibalize(c);
- clear_btree_node_just_written(b2);
- __bch2_btree_node_hash_remove(bc, b2);
-
- if (b) {
- swap(b->data, b2->data);
- swap(b->aux_data, b2->aux_data);
- btree_node_to_freedlist(bc, b2);
- six_unlock_write(&b2->c.lock);
- six_unlock_intent(&b2->c.lock);
- } else {
- b = b2;
- }
-
- BUG_ON(!list_empty(&b->list));
- mutex_unlock(&bc->lock);
-
- trace_and_count(c, btree_cache_cannibalize, trans);
- goto out;
- }
-
- mutex_unlock(&bc->lock);
- return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
-}
-
-/* Slowpath, don't want it inlined into btree_iter_traverse() */
-static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
- struct btree_path *path,
- const struct bkey_i *k,
- enum btree_id btree_id,
- unsigned level,
- enum six_lock_type lock_type,
- bool sync)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- if (unlikely(level >= BTREE_MAX_DEPTH)) {
- int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
- level, BTREE_MAX_DEPTH);
- return ERR_PTR(ret);
- }
-
- if (unlikely(!bkey_is_btree_ptr(&k->k))) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
- int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
- printbuf_exit(&buf);
- return ERR_PTR(ret);
- }
-
- if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
- int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
- printbuf_exit(&buf);
- return ERR_PTR(ret);
- }
-
- /*
- * Parent node must be locked, else we could read in a btree node that's
- * been freed:
- */
- if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
- trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
- }
-
- b = bch2_btree_node_mem_alloc(trans, level != 0);
-
- if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
- if (!path)
- return b;
-
- trans->memory_allocation_failure = true;
- trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
- }
-
- if (IS_ERR(b))
- return b;
-
- bkey_copy(&b->key, k);
- if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
- /* raced with another fill: */
-
- /* mark as unhashed... */
- b->hash_val = 0;
-
- mutex_lock(&bc->lock);
- __bch2_btree_node_to_freelist(bc, b);
- mutex_unlock(&bc->lock);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- return NULL;
- }
-
- set_btree_node_read_in_flight(b);
- six_unlock_write(&b->c.lock);
-
- if (path) {
- u32 seq = six_lock_seq(&b->c.lock);
-
- /* Unlock before doing IO: */
- six_unlock_intent(&b->c.lock);
- bch2_trans_unlock(trans);
-
- bch2_btree_node_read(trans, b, sync);
-
- int ret = bch2_trans_relock(trans);
- if (ret)
- return ERR_PTR(ret);
-
- if (!sync)
- return NULL;
-
- if (!six_relock_type(&b->c.lock, lock_type, seq))
- b = NULL;
- } else {
- bch2_btree_node_read(trans, b, sync);
- if (lock_type == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
- }
-
- return b;
-}
-
-static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
-{
- struct printbuf buf = PRINTBUF;
-
- if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
- return;
-
- prt_printf(&buf,
- "btree node header doesn't match ptr: ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, "\nptr: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- prt_str(&buf, "\nheader: ");
- bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
- prt_str(&buf, "\nmin ");
- bch2_bpos_to_text(&buf, b->data->min_key);
-
- prt_printf(&buf, "\nmax ");
- bch2_bpos_to_text(&buf, b->data->max_key);
-
- bch2_fs_topology_error(c, "%s", buf.buf);
-
- printbuf_exit(&buf);
-}
-
-static inline void btree_check_header(struct bch_fs *c, struct btree *b)
-{
- if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
- b->c.level != BTREE_NODE_LEVEL(b->data) ||
- !bpos_eq(b->data->max_key, b->key.k.p) ||
- (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- !bpos_eq(b->data->min_key,
- bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
- btree_bad_header(c, b);
-}
-
-static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
- const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- bool need_relock = false;
- int ret;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-retry:
- b = btree_cache_find(bc, k);
- if (unlikely(!b)) {
- /*
- * We must have the parent locked to call bch2_btree_node_fill(),
- * else we could read in a btree node from disk that's been
- * freed:
- */
- b = bch2_btree_node_fill(trans, path, k, path->btree_id,
- level, lock_type, true);
- need_relock = true;
-
- /* We raced and found the btree node in the cache */
- if (!b)
- goto retry;
-
- if (IS_ERR(b))
- return b;
- } else {
- if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(trans, path, level + 1);
-
- ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ERR_PTR(ret);
-
- BUG_ON(ret);
-
- if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
- b->c.level != level ||
- race_fault())) {
- six_unlock_type(&b->c.lock, lock_type);
- if (bch2_btree_node_relock(trans, path, level + 1))
- goto retry;
-
- trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
- }
-
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
- }
-
- if (unlikely(btree_node_read_in_flight(b))) {
- u32 seq = six_lock_seq(&b->c.lock);
-
- six_unlock_type(&b->c.lock, lock_type);
- bch2_trans_unlock(trans);
- need_relock = true;
-
- bch2_btree_node_wait_on_read(b);
-
- ret = bch2_trans_relock(trans);
- if (ret)
- return ERR_PTR(ret);
-
- /*
- * should_be_locked is not set on this path yet, so we need to
- * relock it specifically:
- */
- if (!six_relock_type(&b->c.lock, lock_type, seq))
- goto retry;
- }
-
- if (unlikely(need_relock)) {
- ret = bch2_trans_relock(trans) ?:
- bch2_btree_path_relock_intent(trans, path);
- if (ret) {
- six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(ret);
- }
- }
-
- prefetch(b->aux_data);
-
- for_each_bset(b, t) {
- void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- }
-
- if (unlikely(btree_node_read_error(b))) {
- six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
- }
-
- EBUG_ON(b->c.btree_id != path->btree_id);
- EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- btree_check_header(c, b);
-
- return b;
-}
-
-/**
- * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
- * in from disk if necessary.
- *
- * @trans: btree transaction object
- * @path: btree_path being traversed
- * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
- * @level: level of btree node being looked up (0 == leaf node)
- * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
- * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
- *
- * The btree node will have either a read or an intent lock held, depending on
- * @lock_type.
- *
- * Returns: btree node or ERR_PTR()
- */
-struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
- const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree *b;
- int ret;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-
- b = btree_node_mem_ptr(k);
-
- /*
- * Check b->hash_val _before_ calling btree_node_lock() - this might not
- * be the node we want anymore, and trying to lock the wrong node could
-	 * cause an unnecessary transaction restart:
- */
- if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
- !b ||
- b->hash_val != btree_ptr_hash_val(k)))
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-
- if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(trans, path, level + 1);
-
- ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ERR_PTR(ret);
-
- BUG_ON(ret);
-
- if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
- b->c.level != level ||
- race_fault())) {
- six_unlock_type(&b->c.lock, lock_type);
- if (bch2_btree_node_relock(trans, path, level + 1))
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
-
- trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
- }
-
- if (unlikely(btree_node_read_in_flight(b))) {
- six_unlock_type(&b->c.lock, lock_type);
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
- }
-
- prefetch(b->aux_data);
-
- for_each_bset(b, t) {
- void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- }
-
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
-
- if (unlikely(btree_node_read_error(b))) {
- six_unlock_type(&b->c.lock, lock_type);
- return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
- }
-
- EBUG_ON(b->c.btree_id != path->btree_id);
- EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- btree_check_header(c, b);
-
- return b;
-}
-
-struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
- const struct bkey_i *k,
- enum btree_id btree_id,
- unsigned level,
- bool nofill)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- int ret;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-
- if (c->opts.btree_node_mem_ptr_optimization) {
- b = btree_node_mem_ptr(k);
- if (b)
- goto lock_node;
- }
-retry:
- b = btree_cache_find(bc, k);
- if (unlikely(!b)) {
- if (nofill)
- goto out;
-
- b = bch2_btree_node_fill(trans, NULL, k, btree_id,
- level, SIX_LOCK_read, true);
-
- /* We raced and found the btree node in the cache */
- if (!b)
- goto retry;
-
- if (IS_ERR(b) &&
- !bch2_btree_cache_cannibalize_lock(trans, NULL))
- goto retry;
-
- if (IS_ERR(b))
- goto out;
- } else {
-lock_node:
- ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ERR_PTR(ret);
-
- BUG_ON(ret);
-
- if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
- b->c.btree_id != btree_id ||
- b->c.level != level)) {
- six_unlock_read(&b->c.lock);
- goto retry;
- }
-
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
- }
-
- /* XXX: waiting on IO with btree locks held: */
- __bch2_btree_node_wait_on_read(b);
-
- prefetch(b->aux_data);
-
- for_each_bset(b, t) {
- void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- }
-
- if (unlikely(btree_node_read_error(b))) {
- six_unlock_read(&b->c.lock);
- b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
- goto out;
- }
-
- EBUG_ON(b->c.btree_id != btree_id);
- EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
- btree_check_header(c, b);
-out:
- bch2_btree_cache_cannibalize_unlock(trans);
- return b;
-}
-
-int bch2_btree_node_prefetch(struct btree_trans *trans,
- struct btree_path *path,
- const struct bkey_i *k,
- enum btree_id btree_id, unsigned level)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
-
- BUG_ON(path && !btree_node_locked(path, level + 1));
- BUG_ON(level >= BTREE_MAX_DEPTH);
-
- struct btree *b = btree_cache_find(bc, k);
- if (b)
- return 0;
-
- b = bch2_btree_node_fill(trans, path, k, btree_id,
- level, SIX_LOCK_read, false);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- return ret;
- if (b)
- six_unlock_read(&b->c.lock);
- return 0;
-}
-
-void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
-{
- struct bch_fs *c = trans->c;
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- b = btree_cache_find(bc, k);
- if (!b)
- return;
-
- BUG_ON(b == btree_node_root(trans->c, b));
-wait_on_io:
- /* not allowed to wait on io with btree locks held: */
-
- /* XXX we're called from btree_gc which will be holding other btree
- * nodes locked
- */
- __bch2_btree_node_wait_on_read(b);
- __bch2_btree_node_wait_on_write(b);
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
- if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
- goto out;
-
- if (btree_node_dirty(b)) {
- __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- goto wait_on_io;
- }
-
- BUG_ON(btree_node_dirty(b));
-
- mutex_lock(&bc->lock);
- bch2_btree_node_hash_remove(bc, b);
- btree_node_data_free(bc, b);
- mutex_unlock(&bc->lock);
-out:
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-}
-
-const char *bch2_btree_id_str(enum btree_id btree)
-{
- return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
-}
-
-void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
-{
- if (btree < BTREE_ID_NR)
- prt_str(out, __bch2_btree_ids[btree]);
- else
- prt_printf(out, "(unknown btree %u)", btree);
-}
-
-void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
-{
- prt_str(out, "btree=");
- bch2_btree_id_to_text(out, btree);
- prt_printf(out, " level=%u", level);
-}
-
-void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
- enum btree_id btree, unsigned level, struct bkey_s_c k)
-{
- bch2_btree_id_to_text(out, btree);
- prt_printf(out, " level %u/", level);
- struct btree_root *r = bch2_btree_id_root(c, btree);
- if (r)
- prt_printf(out, "%u", r->level);
- else
- prt_printf(out, "(unknown)");
- prt_newline(out);
-
- bch2_bkey_val_to_text(out, c, k);
-}
-
-void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
-{
- __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
-}
-
-void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
-{
- struct bset_stats stats;
-
- memset(&stats, 0, sizeof(stats));
-
- bch2_btree_keys_stats(b, &stats);
-
- prt_printf(out, "l %u ", b->c.level);
- bch2_bpos_to_text(out, b->data->min_key);
- prt_printf(out, " - ");
- bch2_bpos_to_text(out, b->data->max_key);
- prt_printf(out, ":\n"
- " ptrs: ");
- bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
- prt_newline(out);
-
- prt_printf(out,
- " format: ");
- bch2_bkey_format_to_text(out, &b->format);
-
- prt_printf(out,
- " unpack fn len: %u\n"
- " bytes used %zu/%zu (%zu%% full)\n"
- " sib u64s: %u, %u (merge threshold %u)\n"
- " nr packed keys %u\n"
- " nr unpacked keys %u\n"
- " floats %zu\n"
- " failed unpacked %zu\n",
- b->unpack_fn_len,
- b->nr.live_u64s * sizeof(u64),
- btree_buf_bytes(b) - sizeof(struct btree_node),
- b->nr.live_u64s * 100 / btree_max_u64s(c),
- b->sib_u64s[0],
- b->sib_u64s[1],
- c->btree_foreground_merge_threshold,
- b->nr.packed_keys,
- b->nr.unpacked_keys,
- stats.floats,
- stats.failed);
-}
-
-static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
- const char *label, size_t nr)
-{
- prt_printf(out, "%s\t", label);
- prt_human_readable_u64(out, nr * c->opts.btree_node_size);
- prt_printf(out, " (%zu)\n", nr);
-}
-
-static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
-#define x(n) #n,
- BCH_BTREE_CACHE_NOT_FREED_REASONS()
-#undef x
- NULL
-};
-
-void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
-
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
- prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
- prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve);
- prt_btree_cache_line(out, c, "freed:", bc->nr_freeable);
- prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? "held" : "not held");
- prt_newline(out);
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
- bch2_btree_id_to_text(out, i);
- prt_printf(out, "\t");
- prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size);
- prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]);
- }
-
- prt_newline(out);
- prt_printf(out, "counters since mount:\n");
- prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
- prt_printf(out, "not freed:\n");
-
- for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
- prt_printf(out, " %s\t%llu\n",
- bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
-}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
deleted file mode 100644
index be275f87a60e..000000000000
--- a/fs/bcachefs/btree_cache.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_CACHE_H
-#define _BCACHEFS_BTREE_CACHE_H
-
-#include "bcachefs.h"
-#include "btree_types.h"
-#include "bkey_methods.h"
-
-extern const char * const bch2_btree_node_flags[];
-
-struct btree_iter;
-
-void bch2_recalc_btree_reserve(struct bch_fs *);
-
-void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
-
-void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
-void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
-
-int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
-int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
- unsigned, enum btree_id);
-
-void bch2_node_pin(struct bch_fs *, struct btree *);
-void bch2_btree_cache_unpin(struct bch_fs *);
-
-void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *);
-
-void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
-int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
-
-void __btree_node_data_free(struct btree *);
-struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
-
-struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
- const struct bkey_i *, unsigned,
- enum six_lock_type, unsigned long);
-
-struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
- enum btree_id, unsigned, bool);
-
-int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *,
- const struct bkey_i *, enum btree_id, unsigned);
-
-void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
-
-void bch2_fs_btree_cache_exit(struct bch_fs *);
-int bch2_fs_btree_cache_init(struct bch_fs *);
-void bch2_fs_btree_cache_init_early(struct btree_cache *);
-
-static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
-{
- switch (k->k.type) {
- case KEY_TYPE_btree_ptr:
- return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
- case KEY_TYPE_btree_ptr_v2:
- /*
- * The cast/deref is only necessary to avoid sparse endianness
- * warnings:
- */
- return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq);
- default:
- return 0;
- }
-}
-
-static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
-{
- return k->k.type == KEY_TYPE_btree_ptr_v2
- ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr
- : NULL;
-}
-
-/* is btree node in hash table? */
-static inline bool btree_node_hashed(struct btree *b)
-{
- return b->hash_val != 0;
-}
-
-#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
- for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \
- &(_c)->btree_cache.table), \
- _iter = 0; _iter < (_tbl)->size; _iter++) \
- rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-
-static inline size_t btree_buf_bytes(const struct btree *b)
-{
- return 1UL << b->byte_order;
-}
-
-static inline size_t btree_buf_max_u64s(const struct btree *b)
-{
- return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_max_u64s(const struct bch_fs *c)
-{
- return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
- return c->opts.btree_node_size >> SECTOR_SHIFT;
-}
-
-static inline unsigned btree_blocks(const struct bch_fs *c)
-{
- return btree_sectors(c) >> c->block_bits;
-}
-
-#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3)
-
-#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
-#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
-
-static inline unsigned btree_id_nr_alive(struct bch_fs *c)
-{
- return BTREE_ID_NR + c->btree_roots_extra.nr;
-}
-
-static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id)
-{
- if (likely(id < BTREE_ID_NR)) {
- return &c->btree_roots_known[id];
- } else {
- unsigned idx = id - BTREE_ID_NR;
-
- /* This can happen when we're called from btree_node_scan */
- if (idx >= c->btree_roots_extra.nr)
- return NULL;
-
- return &c->btree_roots_extra.data[idx];
- }
-}
-
-static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
-{
- struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id);
-
- return r ? r->b : NULL;
-}
-
-const char *bch2_btree_id_str(enum btree_id); /* avoid */
-void bch2_btree_id_to_text(struct printbuf *, enum btree_id);
-void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned);
-
-void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *,
- enum btree_id, unsigned, struct bkey_s_c);
-void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
-
-#endif /* _BCACHEFS_BTREE_CACHE_H */
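The inline size helpers in this header (btree_buf_bytes(), btree_buf_max_u64s(), btree_max_u64s()) encode the node buffer size as a power-of-two shift (byte_order) and derive the usable key space by subtracting the on-disk node header. A quick standalone check of that arithmetic, with a hypothetical 64-byte header standing in for sizeof(struct btree_node):

/* sketch only: verifies the btree_buf_bytes()/btree_buf_max_u64s() arithmetic */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned byte_order = 18;		/* e.g. a 256KiB btree node */
	size_t buf_bytes = (size_t)1 << byte_order;
	size_t header_bytes = 64;		/* placeholder header size */
	size_t max_u64s = (buf_bytes - header_bytes) / sizeof(uint64_t);

	printf("buf bytes %zu, max u64s %zu\n", buf_bytes, max_u64s);
	return 0;
}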
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
deleted file mode 100644
index bac108e93823..000000000000
--- a/fs/bcachefs/btree_gc.c
+++ /dev/null
@@ -1,1308 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright (C) 2014 Datera Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "progress.h"
-#include "recovery_passes.h"
-#include "reflink.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/slab.h>
-#include <linux/bitops.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched/task.h>
-
-#define DROP_THIS_NODE 10
-#define DROP_PREV_NODE 11
-#define DID_FILL_FROM_SCAN 12
-
-/*
- * Returns false if it's a btree we can easily reconstruct, or that otherwise
- * won't cause data loss if it's missing:
- */
-static bool btree_id_important(enum btree_id btree)
-{
- if (btree_id_is_alloc(btree))
- return false;
-
- switch (btree) {
- case BTREE_ID_quotas:
- case BTREE_ID_snapshot_trees:
- case BTREE_ID_logged_ops:
- case BTREE_ID_rebalance_work:
- case BTREE_ID_subvolume_children:
- return false;
- default:
- return true;
- }
-}
-
-static const char * const bch2_gc_phase_strs[] = {
-#define x(n) #n,
- GC_PHASES()
-#undef x
- NULL
-};
-
-void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
-{
- prt_str(out, bch2_gc_phase_strs[p->phase]);
- prt_char(out, ' ');
- bch2_btree_id_level_to_text(out, p->btree, p->level);
- prt_char(out, ' ');
- bch2_bpos_to_text(out, p->pos);
-}
-
-static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
-{
- return (struct bkey_s) {{{
- (struct bkey *) k.k,
- (struct bch_val *) k.v
- }}};
-}
-
-static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
- preempt_disable();
- write_seqcount_begin(&c->gc_pos_lock);
- c->gc_pos = new_pos;
- write_seqcount_end(&c->gc_pos_lock);
- preempt_enable();
-}
-
-static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
- BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
- __gc_pos_set(c, new_pos);
-}
-
-static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
-{
- switch (b->key.k.type) {
- case KEY_TYPE_btree_ptr: {
- struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
-
- dst->k.p = src->k.p;
- dst->v.mem_ptr = 0;
- dst->v.seq = b->data->keys.seq;
- dst->v.sectors_written = 0;
- dst->v.flags = 0;
- dst->v.min_key = b->data->min_key;
- set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
- memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
- break;
- }
- case KEY_TYPE_btree_ptr_v2:
- bkey_copy(&dst->k_i, &b->key);
- break;
- default:
- BUG();
- }
-}
-
-static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
-{
- struct bkey_i_btree_ptr_v2 *new;
- int ret;
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, " -> ");
- bch2_bpos_to_text(&buf, new_min);
-
- bch_info(c, "%s(): %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
- if (!new)
- return bch_err_throw(c, ENOMEM_gc_repair_key);
-
- btree_ptr_to_v2(b, new);
- b->data->min_key = new_min;
- new->v.min_key = new_min;
- SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
-
- ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
- if (ret) {
- kfree(new);
- return ret;
- }
-
- bch2_btree_node_drop_keys_outside_node(b);
- bkey_copy(&b->key, &new->k_i);
- return 0;
-}
-
-static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
-{
- struct bkey_i_btree_ptr_v2 *new;
- int ret;
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, " -> ");
- bch2_bpos_to_text(&buf, new_max);
-
- bch_info(c, "%s(): %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
- if (ret)
- return ret;
-
- new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
- if (!new)
- return bch_err_throw(c, ENOMEM_gc_repair_key);
-
- btree_ptr_to_v2(b, new);
- b->data->max_key = new_max;
- new->k.p = new_max;
- SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
-
- ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
- if (ret) {
- kfree(new);
- return ret;
- }
-
- bch2_btree_node_drop_keys_outside_node(b);
-
- mutex_lock(&c->btree_cache.lock);
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, &new->k_i);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
- mutex_unlock(&c->btree_cache.lock);
- return 0;
-}
-
-static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
- struct btree *prev, struct btree *cur,
- struct bpos *pulled_from_scan)
-{
- struct bch_fs *c = trans->c;
- struct bpos expected_start = !prev
- ? b->data->min_key
- : bpos_successor(prev->key.k.p);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
- b->data->min_key));
-
- if (bpos_eq(expected_start, cur->data->min_key))
- return 0;
-
- prt_printf(&buf, " at ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_printf(&buf, ":\nparent: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- if (prev) {
- prt_printf(&buf, "\nprev: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
- }
-
- prt_str(&buf, "\nnext: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
-
- if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
- if (b->c.level == 1 &&
- bpos_lt(*pulled_from_scan, cur->data->min_key)) {
- ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
- expected_start,
- bpos_predecessor(cur->data->min_key));
- if (ret)
- goto err;
-
- *pulled_from_scan = cur->data->min_key;
- ret = DID_FILL_FROM_SCAN;
- } else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
- "btree node with incorrect min_key%s", buf.buf))
- ret = set_node_min(c, cur, expected_start);
- }
- } else { /* overlap */
- if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
- if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
- if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
- "btree node overwritten by next node%s", buf.buf))
- ret = DROP_PREV_NODE;
- } else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
- "btree node with incorrect max_key%s", buf.buf))
- ret = set_node_max(c, prev,
- bpos_predecessor(cur->data->min_key));
- }
- } else {
- if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
- if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
- "btree node overwritten by prev node%s", buf.buf))
- ret = DROP_THIS_NODE;
- } else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
- "btree node with incorrect min_key%s", buf.buf))
- ret = set_node_min(c, cur, expected_start);
- }
- }
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
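btree_check_node_boundaries() above boils down to comparing each child's min_key against the expected start of its range: the parent's min_key for the first child, otherwise the successor of the previous child's max_key. Equal means the nodes tile correctly, greater means a gap (possibly refillable from scanned nodes), smaller means an overlap that has to be repaired by clamping or dropping a node. A toy model of that classification, using plain integers in place of struct bpos:

/* sketch: classify a child's start against the expected start of its range */
#include <stdio.h>
#include <stdint.h>

enum boundary { BOUNDARY_OK, BOUNDARY_GAP, BOUNDARY_OVERLAP };

static enum boundary classify(uint64_t expected_start, uint64_t cur_min)
{
	if (cur_min == expected_start)
		return BOUNDARY_OK;
	return cur_min > expected_start ? BOUNDARY_GAP : BOUNDARY_OVERLAP;
}

int main(void)
{
	/* previous child covered keys up to 99, so the next must start at 100 */
	printf("%d\n", classify(100, 100));	/* BOUNDARY_OK */
	printf("%d\n", classify(100, 150));	/* BOUNDARY_GAP: 100..149 missing */
	printf("%d\n", classify(100,  80));	/* BOUNDARY_OVERLAP: 80..99 covered twice */
	return 0;
}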
-
-static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
- struct btree *child, struct bpos *pulled_from_scan)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (bpos_eq(child->key.k.p, b->key.k.p))
- return 0;
-
- prt_printf(&buf, "\nat: ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_printf(&buf, "\nparent: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- prt_str(&buf, "\nchild: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
-
- if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
- "btree node with incorrect max_key%s", buf.buf)) {
- if (b->c.level == 1 &&
- bpos_lt(*pulled_from_scan, b->key.k.p)) {
- ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
- bpos_successor(child->key.k.p), b->key.k.p);
- if (ret)
- goto err;
-
- *pulled_from_scan = b->key.k.p;
- ret = DID_FILL_FROM_SCAN;
- } else {
- ret = set_node_max(c, child, b->key.k.p);
- }
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
- struct bpos *pulled_from_scan)
-{
- struct bch_fs *c = trans->c;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct bkey_buf prev_k, cur_k;
- struct btree *prev = NULL, *cur = NULL;
- bool have_child, new_pass = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (!b->c.level)
- return 0;
-
- bch2_bkey_buf_init(&prev_k);
- bch2_bkey_buf_init(&cur_k);
-again:
- cur = prev = NULL;
- have_child = new_pass = false;
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- iter.prefetch = true;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- BUG_ON(bpos_lt(k.k->p, b->data->min_key));
- BUG_ON(bpos_gt(k.k->p, b->data->max_key));
-
- bch2_btree_and_journal_iter_advance(&iter);
- bch2_bkey_buf_reassemble(&cur_k, c, k);
-
- cur = bch2_btree_node_get_noiter(trans, cur_k.k,
- b->c.btree_id, b->c.level - 1,
- false);
- ret = PTR_ERR_OR_ZERO(cur);
-
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
-
- if (bch2_err_matches(ret, EIO)) {
- bch2_btree_node_evict(trans, cur_k.k);
- cur = NULL;
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- if (ret)
- break;
- continue;
- }
-
- bch_err_msg(c, ret, "getting btree node");
- if (ret)
- break;
-
- if (bch2_btree_node_is_stale(c, cur)) {
- bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
- six_unlock_read(&cur->c.lock);
- bch2_btree_node_evict(trans, cur_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- cur = NULL;
- if (ret)
- break;
- continue;
- }
-
- ret = lockrestart_do(trans,
- btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan));
- if (ret < 0)
- goto err;
-
- if (ret == DID_FILL_FROM_SCAN) {
- new_pass = true;
- ret = 0;
- }
-
- if (ret == DROP_THIS_NODE) {
- six_unlock_read(&cur->c.lock);
- bch2_btree_node_evict(trans, cur_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- cur = NULL;
- if (ret)
- break;
- continue;
- }
-
- if (prev)
- six_unlock_read(&prev->c.lock);
- prev = NULL;
-
- if (ret == DROP_PREV_NODE) {
- bch_info(c, "dropped prev node");
- bch2_btree_node_evict(trans, prev_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, prev_k.k->k.p);
- if (ret)
- break;
-
- bch2_btree_and_journal_iter_exit(&iter);
- goto again;
- } else if (ret)
- break;
-
- prev = cur;
- cur = NULL;
- bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
- }
-
- if (!ret && !IS_ERR_OR_NULL(prev)) {
- BUG_ON(cur);
- ret = lockrestart_do(trans,
- btree_repair_node_end(trans, b, prev, pulled_from_scan));
- if (ret == DID_FILL_FROM_SCAN) {
- new_pass = true;
- ret = 0;
- }
- }
-
- if (!IS_ERR_OR_NULL(prev))
- six_unlock_read(&prev->c.lock);
- prev = NULL;
- if (!IS_ERR_OR_NULL(cur))
- six_unlock_read(&cur->c.lock);
- cur = NULL;
-
- if (ret)
- goto err;
-
- bch2_btree_and_journal_iter_exit(&iter);
-
- if (new_pass)
- goto again;
-
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
- iter.prefetch = true;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- bch2_bkey_buf_reassemble(&cur_k, c, k);
- bch2_btree_and_journal_iter_advance(&iter);
-
- cur = bch2_btree_node_get_noiter(trans, cur_k.k,
- b->c.btree_id, b->c.level - 1,
- false);
- ret = PTR_ERR_OR_ZERO(cur);
-
- bch_err_msg(c, ret, "getting btree node");
- if (ret)
- goto err;
-
- ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
- six_unlock_read(&cur->c.lock);
- cur = NULL;
-
- if (ret == DROP_THIS_NODE) {
- bch2_btree_node_evict(trans, cur_k.k);
- ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
- new_pass = true;
- }
-
- if (ret)
- goto err;
-
- have_child = true;
- }
-
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- /*
- * XXX: we're not passing the trans object here because we're not set up
- * to handle a transaction restart - this code needs to be rewritten
- * when we start doing online topology repair
- */
- bch2_trans_unlock_long(trans);
- if (mustfix_fsck_err_on(!have_child,
- c, btree_node_topology_interior_node_empty,
- "empty interior btree node at %s", buf.buf))
- ret = DROP_THIS_NODE;
-err:
-fsck_err:
- if (!IS_ERR_OR_NULL(prev))
- six_unlock_read(&prev->c.lock);
- if (!IS_ERR_OR_NULL(cur))
- six_unlock_read(&cur->c.lock);
-
- bch2_btree_and_journal_iter_exit(&iter);
-
- if (!ret && new_pass)
- goto again;
-
- BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
-
- bch2_bkey_buf_exit(&prev_k, c);
- bch2_bkey_buf_exit(&cur_k, c);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
- bool *reconstructed_root)
-{
- struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, btree);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- bch2_btree_id_to_text(&buf, btree);
-
- if (r->error) {
- bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
-
- ret = bch2_btree_has_scanned_nodes(c, btree);
- if (ret < 0)
- goto err;
-
- if (!ret) {
- __fsck_err(trans,
- FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
- btree_root_unreadable_and_scan_found_nothing,
- "no nodes found for btree %s, continue?", buf.buf);
-
- r->alive = false;
- r->error = 0;
- bch2_btree_root_alloc_fake_trans(trans, btree, 0);
- } else {
- r->alive = false;
- r->error = 0;
- bch2_btree_root_alloc_fake_trans(trans, btree, 1);
-
- bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
- if (ret)
- goto err;
- }
-
- *reconstructed_root = true;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_check_topology(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bpos pulled_from_scan = POS_MIN;
- int ret = 0;
-
- bch2_trans_srcu_unlock(trans);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
- bool reconstructed_root = false;
-recover:
- ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root));
- if (ret)
- break;
-
- struct btree_root *r = bch2_btree_id_root(c, i);
- struct btree *b = r->b;
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
- six_unlock_read(&b->c.lock);
-
- if (ret == DROP_THIS_NODE) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- r->b = NULL;
-
- if (!reconstructed_root) {
- r->error = -EIO;
- goto recover;
- }
-
- struct printbuf buf = PRINTBUF;
- bch2_btree_id_to_text(&buf, i);
- bch_err(c, "empty btree root %s", buf.buf);
- printbuf_exit(&buf);
- bch2_btree_root_alloc_fake_trans(trans, i, 0);
- r->alive = false;
- ret = 0;
- }
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-/* marking of btree keys/nodes: */
-
-static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
- unsigned level, struct btree **prev,
- struct btree_iter *iter, struct bkey_s_c k,
- bool initial)
-{
- struct bch_fs *c = trans->c;
-
- if (iter) {
- struct btree_path *path = btree_iter_path(trans, iter);
- struct btree *b = path_l(path)->b;
-
- if (*prev != b) {
- int ret = bch2_btree_node_check_topology(trans, b);
- if (ret)
- return ret;
- }
- *prev = b;
- }
-
- struct bkey deleted = KEY(0, 0, 0);
- struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- deleted.p = k.k->p;
-
- if (initial) {
- BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) &&
- k.k->bversion.lo > atomic64_read(&c->journal.seq));
-
- if (fsck_err_on(btree_id != BTREE_ID_accounting &&
- k.k->bversion.lo > atomic64_read(&c->key_version),
- trans, bkey_version_in_future,
- "key version number higher than recorded %llu\n%s",
- atomic64_read(&c->key_version),
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- atomic64_set(&c->key_version, k.k->bversion.lo);
- }
-
- if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
- trans, btree_bitmap_not_marked,
- "btree ptr not marked in member info btree allocated bitmap\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- mutex_lock(&c->sb_lock);
- bch2_dev_btree_bitmap_mark(c, k);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- /*
- * We require a commit before key_trigger() because
-	 * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
- * wrong result if we run it multiple times.
- */
- unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;
-
- ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
- BTREE_TRIGGER_check_repair|flags);
- if (ret)
- goto out;
-
- if (trans->nr_updates) {
- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-
- ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
- BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
-out:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_gc_btree(struct btree_trans *trans,
- struct progress_indicator_state *progress,
- enum btree_id btree, bool initial)
-{
- struct bch_fs *c = trans->c;
- unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
- int ret = 0;
-
- /* We need to make sure every leaf node is readable before going RW */
- if (initial)
- target_depth = 0;
-
- for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
- struct btree *prev = NULL;
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
- BTREE_ITER_prefetch);
-
- ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
- gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
- bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
- }));
- if (ret)
- goto err;
- }
-
- /* root */
- do {
-retry_root:
- bch2_trans_begin(trans);
-
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
- 0, bch2_btree_id_root(c, btree)->b->c.level, 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err_root;
-
- if (b != btree_node_root(c, b)) {
- bch2_trans_iter_exit(trans, &iter);
- goto retry_root;
- }
-
- gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
-err_root:
- bch2_trans_iter_exit(trans, &iter);
- } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
-{
- return cmp_int(gc_btree_order(l), gc_btree_order(r));
-}
-
-static int bch2_gc_btrees(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct progress_indicator_state progress;
- bch2_progress_init(&progress, c, ~0ULL);
-
- enum btree_id ids[BTREE_ID_NR];
- for (unsigned i = 0; i < BTREE_ID_NR; i++)
- ids[i] = i;
- bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
- unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
-
- if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
- continue;
-
- ret = bch2_gc_btree(trans, &progress, btree, true);
- }
-
- printbuf_exit(&buf);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_mark_superblocks(struct bch_fs *c)
-{
- gc_pos_set(c, gc_phase(GC_PHASE_sb));
-
- return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
-}
-
-static void bch2_gc_free(struct bch_fs *c)
-{
- bch2_accounting_gc_free(c);
-
- genradix_free(&c->reflink_gc_table);
- genradix_free(&c->gc_stripes);
-
- for_each_member_device(c, ca)
- genradix_free(&ca->buckets_gc);
-}
-
-static int bch2_gc_start(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- int ret = bch2_dev_usage_init(ca, true);
- if (ret) {
- bch2_dev_put(ca);
- return ret;
- }
- }
-
- return 0;
-}
-
-/* returns true if not equal */
-static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
- struct bch_alloc_v4 r)
-{
- return l.gen != r.gen ||
- l.oldest_gen != r.oldest_gen ||
- l.data_type != r.data_type ||
- l.dirty_sectors != r.dirty_sectors ||
- l.stripe_sectors != r.stripe_sectors ||
- l.cached_sectors != r.cached_sectors ||
- l.stripe_redundancy != r.stripe_redundancy ||
- l.stripe != r.stripe;
-}
-
-static int bch2_alloc_write_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_dev *ca,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_alloc_v4 *a;
- struct bch_alloc_v4 old_gc, gc, old_convert, new;
- const struct bch_alloc_v4 *old;
- int ret;
-
- if (!bucket_valid(ca, k.k->p.offset))
- return 0;
-
- old = bch2_alloc_to_v4(k, &old_convert);
- gc = new = *old;
-
- __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
-
- old_gc = gc;
-
- if ((old->data_type == BCH_DATA_sb ||
- old->data_type == BCH_DATA_journal) &&
- !bch2_dev_is_online(ca)) {
- gc.data_type = old->data_type;
- gc.dirty_sectors = old->dirty_sectors;
- }
-
- /*
- * gc.data_type doesn't yet include need_discard & need_gc_gen states -
- * fix that here:
- */
- alloc_data_type_set(&gc, gc.data_type);
- if (gc.data_type != old_gc.data_type ||
- gc.dirty_sectors != old_gc.dirty_sectors) {
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
- if (ret)
- return ret;
-
- /*
- * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
- * safe w.r.t. transaction restarts, so fixup the gc_bucket so
- * we don't run it twice:
- */
- struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
- gc_m->data_type = gc.data_type;
- gc_m->dirty_sectors = gc.dirty_sectors;
- }
-
- if (fsck_err_on(new.data_type != gc.data_type,
- trans, alloc_key_data_type_wrong,
- "bucket %llu:%llu gen %u has wrong data_type"
- ": got %s, should be %s",
- iter->pos.inode, iter->pos.offset,
- gc.gen,
- bch2_data_type_str(new.data_type),
- bch2_data_type_str(gc.data_type)))
- new.data_type = gc.data_type;
-
-#define copy_bucket_field(_errtype, _f) \
- if (fsck_err_on(new._f != gc._f, \
- trans, _errtype, \
- "bucket %llu:%llu gen %u data type %s has wrong " #_f \
- ": got %llu, should be %llu", \
- iter->pos.inode, iter->pos.offset, \
- gc.gen, \
- bch2_data_type_str(gc.data_type), \
- (u64) new._f, (u64) gc._f)) \
- new._f = gc._f; \
-
- copy_bucket_field(alloc_key_gen_wrong, gen);
- copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors);
- copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors);
- copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
- copy_bucket_field(alloc_key_stripe_wrong, stripe);
- copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
-#undef copy_bucket_field
-
- if (!bch2_alloc_v4_cmp(*old, new))
- return 0;
-
- a = bch2_alloc_to_v4_mut(trans, k);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ret;
-
- a->v = new;
-
- /*
- * The trigger normally makes sure these are set, but we're not running
- * triggers:
- */
- if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
- a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
-
- ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
-fsck_err:
- return ret;
-}
-
-static int bch2_gc_alloc_done(struct bch_fs *c)
-{
- int ret = 0;
-
- for_each_member_device(c, ca) {
- ret = bch2_trans_run(c,
- for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
- POS(ca->dev_idx, ca->mi.first_bucket),
- POS(ca->dev_idx, ca->mi.nbuckets - 1),
- BTREE_ITER_slots|BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_alloc_write_key(trans, &iter, ca, k)));
- if (ret) {
- bch2_dev_put(ca);
- break;
- }
- }
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_gc_alloc_start(struct bch_fs *c)
-{
- int ret = 0;
-
- for_each_member_device(c, ca) {
- ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
- if (ret) {
- bch2_dev_put(ca);
- ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
- break;
- }
- }
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_gc_write_stripes_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- const struct bch_stripe *s;
- struct gc_stripe *m;
- bool bad = false;
- unsigned i;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
-
- s = bkey_s_c_to_stripe(k).v;
- m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
-
- for (i = 0; i < s->nr_blocks; i++) {
- u32 old = stripe_blockcount_get(s, i);
- u32 new = (m ? m->block_sectors[i] : 0);
-
- if (old != new) {
- prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
- i, old, new);
- bad = true;
- }
- }
-
- if (bad)
- bch2_bkey_val_to_text(&buf, c, k);
-
- if (fsck_err_on(bad,
- trans, stripe_sector_count_wrong,
- "%s", buf.buf)) {
- struct bkey_i_stripe *new;
-
- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bkey_reassemble(&new->k_i, k);
-
- for (i = 0; i < new->v.nr_blocks; i++)
- stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
-
- ret = bch2_trans_update(trans, iter, &new->k_i, 0);
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_gc_stripes_done(struct bch_fs *c)
-{
- return bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_gc_write_stripes_key(trans, &iter, k)));
-}
-
-/**
- * bch2_check_allocations - walk all references to buckets, and recompute them:
- *
- * @c: filesystem object
- *
- * Returns: 0 on success, or standard errcode on failure
- *
- * Order matters here:
- * - Concurrent GC relies on the fact that we have a total ordering for
- * everything that GC walks - see gc_will_visit_node(),
- * gc_will_visit_root()
- *
- * - also, references move around in the course of index updates and
- * various other crap: everything needs to agree on the ordering
- * references are allowed to move around in - e.g., we're allowed to
- * start with a reference owned by an open_bucket (the allocator) and
- * move it to the btree, but not the reverse.
- *
- * This is necessary to ensure that gc doesn't miss references that
- * move around - if references move backwards in the ordering GC
- * uses, GC could skip past them
- */
-int bch2_check_allocations(struct bch_fs *c)
-{
- int ret;
-
- down_read(&c->state_lock);
- down_write(&c->gc_lock);
-
- bch2_btree_interior_updates_flush(c);
-
- ret = bch2_gc_accounting_start(c) ?:
- bch2_gc_start(c) ?:
- bch2_gc_alloc_start(c) ?:
- bch2_gc_reflink_start(c);
- if (ret)
- goto out;
-
- gc_pos_set(c, gc_phase(GC_PHASE_start));
-
- ret = bch2_mark_superblocks(c);
- bch_err_msg(c, ret, "marking superblocks");
- if (ret)
- goto out;
-
- ret = bch2_gc_btrees(c);
- if (ret)
- goto out;
-
- c->gc_count++;
-
- ret = bch2_gc_alloc_done(c) ?:
- bch2_gc_accounting_done(c) ?:
- bch2_gc_stripes_done(c) ?:
- bch2_gc_reflink_done(c);
-out:
- percpu_down_write(&c->mark_lock);
- /* Indicates that gc is no longer in progress: */
- __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
-
- bch2_gc_free(c);
- percpu_up_write(&c->mark_lock);
-
- up_write(&c->gc_lock);
- up_read(&c->state_lock);
-
- /*
- * At startup, allocations can happen directly instead of via the
- * allocator thread - issue wakeup in case they blocked on gc_lock:
- */
- closure_wake_up(&c->freelist_wait);
-
- if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
- bch2_sb_members_clean_deleted(c);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int gc_btree_gens_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
- return -EROFS;
-
- bool too_stale = false;
- scoped_guard(rcu) {
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- too_stale |= dev_ptr_stale(ca, ptr) > 16;
- }
-
- if (!too_stale)
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(*gen, ptr->gen))
- *gen = ptr->gen;
- }
- }
-
- if (too_stale) {
- struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
- int ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- bch2_extent_normalize(c, bkey_i_to_s(u));
- }
-
- return 0;
-}
-
-static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
- struct btree_iter *iter, struct bkey_s_c k)
-{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- struct bkey_i_alloc_v4 *a_mut;
- int ret;
-
- if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
- return 0;
-
- a_mut = bch2_alloc_to_v4_mut(trans, k);
- ret = PTR_ERR_OR_ZERO(a_mut);
- if (ret)
- return ret;
-
- a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
-
- return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
-}
-
-int bch2_gc_gens(struct bch_fs *c)
-{
- u64 b, start_time = local_clock();
- int ret;
-
- if (!mutex_trylock(&c->gc_gens_lock))
- return 0;
-
- trace_and_count(c, gc_gens_start, c);
-
- /*
- * We have to use trylock here. Otherwise, we would
- * introduce a deadlock in the RO path - we take the
- * state lock at the start of going RO.
- */
- if (!down_read_trylock(&c->state_lock)) {
- mutex_unlock(&c->gc_gens_lock);
- return 0;
- }
-
- for_each_member_device(c, ca) {
- struct bucket_gens *gens = bucket_gens(ca);
-
- BUG_ON(ca->oldest_gen);
-
- ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
- if (!ca->oldest_gen) {
- bch2_dev_put(ca);
- ret = bch_err_throw(c, ENOMEM_gc_gens);
- goto err;
- }
-
- for (b = gens->first_bucket;
- b < gens->nbuckets; b++)
- ca->oldest_gen[b] = gens->b[b];
- }
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++)
- if (btree_type_has_ptrs(i)) {
- c->gc_gens_btree = i;
- c->gc_gens_pos = POS_MIN;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, i,
- POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- gc_btree_gens_key(trans, &iter, k)));
- if (ret)
- goto err;
- }
-
- struct bch_dev *ca = NULL;
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN,
- BTREE_ITER_prefetch,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- if (!ca) {
- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
- bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
- })));
- bch2_dev_put(ca);
-
- if (ret)
- goto err;
-
- c->gc_gens_btree = 0;
- c->gc_gens_pos = POS_MIN;
-
- c->gc_count++;
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
- trace_and_count(c, gc_gens_end, c);
-err:
- for_each_member_device(c, ca) {
- kvfree(ca->oldest_gen);
- ca->oldest_gen = NULL;
- }
-
- up_read(&c->state_lock);
- mutex_unlock(&c->gc_gens_lock);
- if (!bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
- return ret;
-}
-
-static void bch2_gc_gens_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
- bch2_gc_gens(c);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
-}
-
-void bch2_gc_gens_async(struct bch_fs *c)
-{
- if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) &&
- !queue_work(c->write_ref_wq, &c->gc_gens_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
-}
-
-void bch2_fs_btree_gc_init_early(struct bch_fs *c)
-{
- seqcount_init(&c->gc_pos_lock);
- INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
-
- init_rwsem(&c->gc_lock);
- mutex_init(&c->gc_gens_lock);
-}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
deleted file mode 100644
index ec77662369a2..000000000000
--- a/fs/bcachefs/btree_gc.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_GC_H
-#define _BCACHEFS_BTREE_GC_H
-
-#include "bkey.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-
-int bch2_check_topology(struct bch_fs *);
-int bch2_check_allocations(struct bch_fs *);
-
-/*
- * For concurrent mark and sweep (with other index updates), we define a total
- * ordering of _all_ references GC walks:
- *
- * Note that some references will have the same GC position as others - e.g.
- * everything within the same btree node; in those cases we're relying on
- * whatever locking exists for where those references live, i.e. the write lock
- * on a btree node.
- *
- * That locking is also required to ensure GC doesn't pass the updater in
- * between the updater adding/removing the reference and updating the GC marks;
- * without that, we would at best double count sometimes.
- *
- * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
- * be held that prevents GC from passing the position the updater is at.
- *
- * (What about the start of gc, when we're clearing all the marks? GC clears the
- * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
- * position inside its cmpxchg loop, so crap magically works).
- */
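
/*
 * A rough sketch of the updater pattern described above (illustrative
 * only; update_gc_copy() is a hypothetical helper, not defined here):
 * with the appropriate lock held, an updater only touches GC's copy of
 * the marks if GC has already walked past this position - otherwise GC
 * will account for the reference itself when it reaches it:
 *
 *	if (gc_visited(c, gc_pos_btree(btree, level, k->k.p)))
 *		update_gc_copy(c, k);
 */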
-
-/* Position of (the start of) a gc phase: */
-static inline struct gc_pos gc_phase(enum gc_phase phase)
-{
- return (struct gc_pos) { .phase = phase, };
-}
-
-static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
- struct bpos pos)
-{
- return (struct gc_pos) {
- .phase = GC_PHASE_btree,
- .btree = btree,
- .level = level,
- .pos = pos,
- };
-}
-
-static inline int gc_btree_order(enum btree_id btree)
-{
- if (btree == BTREE_ID_alloc)
- return -2;
- if (btree == BTREE_ID_stripes)
- return -1;
- return btree;
-}
-
-static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
-{
- return cmp_int(l.phase, r.phase) ?:
- cmp_int(gc_btree_order(l.btree),
- gc_btree_order(r.btree)) ?:
- cmp_int(l.level, r.level) ?:
- bpos_cmp(l.pos, r.pos);
-}
-
-static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
-{
- unsigned seq;
- bool ret;
-
- do {
- seq = read_seqcount_begin(&c->gc_pos_lock);
- ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
- } while (read_seqcount_retry(&c->gc_pos_lock, seq));
-
- return ret;
-}
-
-void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
-
-int bch2_gc_gens(struct bch_fs *);
-void bch2_gc_gens_async(struct bch_fs *);
-
-void bch2_fs_btree_gc_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h
deleted file mode 100644
index c24dd6edf377..000000000000
--- a/fs/bcachefs/btree_gc_types.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_GC_TYPES_H
-#define _BCACHEFS_BTREE_GC_TYPES_H
-
-#include <linux/generic-radix-tree.h>
-
-#define GC_PHASES() \
- x(not_running) \
- x(start) \
- x(sb) \
- x(btree)
-
-enum gc_phase {
-#define x(n) GC_PHASE_##n,
- GC_PHASES()
-#undef x
-};
-
-struct gc_pos {
- enum gc_phase phase:8;
- enum btree_id btree:8;
- u16 level;
- struct bpos pos;
-};
-
-struct reflink_gc {
- u64 offset;
- u32 size;
- u32 refcount;
-};
-
-typedef GENRADIX(struct reflink_gc) reflink_gc_table;
-
-#endif /* _BCACHEFS_BTREE_GC_TYPES_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
deleted file mode 100644
index 590cd29f3e86..000000000000
--- a/fs/bcachefs/btree_io.c
+++ /dev/null
@@ -1,2742 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "async_objs.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "bkey_sort.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "debug.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "recovery.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
-{
- bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
- prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn));
- prt_str(out, "min: ");
- bch2_bpos_to_text(out, bn->min_key);
- prt_newline(out);
- prt_str(out, "max: ");
- bch2_bpos_to_text(out, bn->max_key);
-}
-
-void bch2_btree_node_io_unlock(struct btree *b)
-{
- EBUG_ON(!btree_node_write_in_flight(b));
-
- clear_btree_node_write_in_flight_inner(b);
- clear_btree_node_write_in_flight(b);
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-void bch2_btree_node_io_lock(struct btree *b)
-{
- wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void __bch2_btree_node_wait_on_read(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void __bch2_btree_node_wait_on_write(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void bch2_btree_node_wait_on_read(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-void bch2_btree_node_wait_on_write(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-static void verify_no_dups(struct btree *b,
- struct bkey_packed *start,
- struct bkey_packed *end)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bkey_packed *k, *p;
-
- if (start == end)
- return;
-
- for (p = start, k = bkey_p_next(start);
- k != end;
- p = k, k = bkey_p_next(k)) {
- struct bkey l = bkey_unpack_key(b, p);
- struct bkey r = bkey_unpack_key(b, k);
-
- BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
- }
-#endif
-}
-
-static void set_needs_whiteout(struct bset *i, int v)
-{
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
- k->needs_whiteout = v;
-}
-
-static void btree_bounce_free(struct bch_fs *c, size_t size,
- bool used_mempool, void *p)
-{
- if (used_mempool)
- mempool_free(p, &c->btree_bounce_pool);
- else
- kvfree(p);
-}
-
-static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
- bool *used_mempool)
-{
- unsigned flags = memalloc_nofs_save();
- void *p;
-
- BUG_ON(size > c->opts.btree_node_size);
-
- *used_mempool = false;
- p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
- if (!p) {
- *used_mempool = true;
- p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
- }
- memalloc_nofs_restore(flags);
- return p;
-}
-
-static void sort_bkey_ptrs(const struct btree *bt,
- struct bkey_packed **ptrs, unsigned nr)
-{
- unsigned n = nr, a = nr / 2, b, c, d;
-
- if (!a)
- return;
-
- /* Heap sort: see lib/sort.c: */
- while (1) {
- if (a)
- a--;
- else if (--n)
- swap(ptrs[0], ptrs[n]);
- else
- break;
-
- for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
- b = bch2_bkey_cmp_packed(bt,
- ptrs[c],
- ptrs[d]) >= 0 ? c : d;
- if (d == n)
- b = c;
-
- while (b != a &&
- bch2_bkey_cmp_packed(bt,
- ptrs[a],
- ptrs[b]) >= 0)
- b = (b - 1) / 2;
- c = b;
- while (b != a) {
- b = (b - 1) / 2;
- swap(ptrs[b], ptrs[c]);
- }
- }
-}
-
-static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
-{
- struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
- bool used_mempool = false;
- size_t bytes = b->whiteout_u64s * sizeof(u64);
-
- if (!b->whiteout_u64s)
- return;
-
- new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
-
- ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
-
- for (k = unwritten_whiteouts_start(b);
- k != unwritten_whiteouts_end(b);
- k = bkey_p_next(k))
- *--ptrs = k;
-
- sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
-
- k = new_whiteouts;
-
- while (ptrs != ptrs_end) {
- bkey_p_copy(k, *ptrs);
- k = bkey_p_next(k);
- ptrs++;
- }
-
- verify_no_dups(b, new_whiteouts,
- (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
-
- memcpy_u64s(unwritten_whiteouts_start(b),
- new_whiteouts, b->whiteout_u64s);
-
- btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
-}
-
-static bool should_compact_bset(struct btree *b, struct bset_tree *t,
- bool compacting, enum compact_mode mode)
-{
- if (!bset_dead_u64s(b, t))
- return false;
-
- switch (mode) {
- case COMPACT_LAZY:
- return should_compact_bset_lazy(b, t) ||
- (compacting && !bset_written(b, bset(b, t)));
- case COMPACT_ALL:
- return true;
- default:
- BUG();
- }
-}
-
-static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
-{
- bool ret = false;
-
- for_each_bset(b, t) {
- struct bset *i = bset(b, t);
- struct bkey_packed *k, *n, *out, *start, *end;
- struct btree_node_entry *src = NULL, *dst = NULL;
-
- if (t != b->set && !bset_written(b, i)) {
- src = container_of(i, struct btree_node_entry, keys);
- dst = max(write_block(b),
- (void *) btree_bkey_last(b, t - 1));
- }
-
- if (src != dst)
- ret = true;
-
- if (!should_compact_bset(b, t, ret, mode)) {
- if (src != dst) {
- memmove(dst, src, sizeof(*src) +
- le16_to_cpu(src->keys.u64s) *
- sizeof(u64));
- i = &dst->keys;
- set_btree_bset(b, t, i);
- }
- continue;
- }
-
- start = btree_bkey_first(b, t);
- end = btree_bkey_last(b, t);
-
- if (src != dst) {
- memmove(dst, src, sizeof(*src));
- i = &dst->keys;
- set_btree_bset(b, t, i);
- }
-
- out = i->start;
-
- for (k = start; k != end; k = n) {
- n = bkey_p_next(k);
-
- if (!bkey_deleted(k)) {
- bkey_p_copy(out, k);
- out = bkey_p_next(out);
- } else {
- BUG_ON(k->needs_whiteout);
- }
- }
-
- i->u64s = cpu_to_le16((u64 *) out - i->_data);
- set_btree_bset_end(b, t);
- bch2_bset_set_no_aux_tree(b, t);
- ret = true;
- }
-
- bch2_verify_btree_nr_keys(b);
-
- bch2_btree_build_aux_trees(b);
-
- return ret;
-}
-
-bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
- enum compact_mode mode)
-{
- return bch2_drop_whiteouts(b, mode);
-}
-
-static void btree_node_sort(struct bch_fs *c, struct btree *b,
- unsigned start_idx,
- unsigned end_idx)
-{
- struct btree_node *out;
- struct sort_iter_stack sort_iter;
- struct bset_tree *t;
- struct bset *start_bset = bset(b, &b->set[start_idx]);
- bool used_mempool = false;
- u64 start_time, seq = 0;
- unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
- bool sorting_entire_node = start_idx == 0 &&
- end_idx == b->nsets;
-
- sort_iter_stack_init(&sort_iter, b);
-
- for (t = b->set + start_idx;
- t < b->set + end_idx;
- t++) {
- u64s += le16_to_cpu(bset(b, t)->u64s);
- sort_iter_add(&sort_iter.iter,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- }
-
- bytes = sorting_entire_node
- ? btree_buf_bytes(b)
- : __vstruct_bytes(struct btree_node, u64s);
-
- out = btree_bounce_alloc(c, bytes, &used_mempool);
-
- start_time = local_clock();
-
- u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
-
- out->keys.u64s = cpu_to_le16(u64s);
-
- BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
-
- if (sorting_entire_node)
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
- start_time);
-
- /* Make sure we preserve bset journal_seq: */
- for (t = b->set + start_idx; t < b->set + end_idx; t++)
- seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
- start_bset->journal_seq = cpu_to_le64(seq);
-
- if (sorting_entire_node) {
- u64s = le16_to_cpu(out->keys.u64s);
-
- BUG_ON(bytes != btree_buf_bytes(b));
-
- /*
- * Our temporary buffer is the same size as the btree node's
- * buffer, we can just swap buffers instead of doing a big
- * memcpy()
- */
- *out = *b->data;
- out->keys.u64s = cpu_to_le16(u64s);
- swap(out, b->data);
- set_btree_bset(b, b->set, &b->data->keys);
- } else {
- start_bset->u64s = out->keys.u64s;
- memcpy_u64s(start_bset->start,
- out->keys.start,
- le16_to_cpu(out->keys.u64s));
- }
-
- for (i = start_idx + 1; i < end_idx; i++)
- b->nr.bset_u64s[start_idx] +=
- b->nr.bset_u64s[i];
-
- b->nsets -= shift;
-
- for (i = start_idx + 1; i < b->nsets; i++) {
- b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift];
- b->set[i] = b->set[i + shift];
- }
-
- for (i = b->nsets; i < MAX_BSETS; i++)
- b->nr.bset_u64s[i] = 0;
-
- set_btree_bset_end(b, &b->set[start_idx]);
- bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
-
- btree_bounce_free(c, bytes, used_mempool, out);
-
- bch2_verify_btree_nr_keys(b);
-}
-
-void bch2_btree_sort_into(struct bch_fs *c,
- struct btree *dst,
- struct btree *src)
-{
- struct btree_nr_keys nr;
- struct btree_node_iter src_iter;
- u64 start_time = local_clock();
-
- BUG_ON(dst->nsets != 1);
-
- bch2_bset_set_no_aux_tree(dst, dst->set);
-
- bch2_btree_node_iter_init_from_start(&src_iter, src);
-
- nr = bch2_sort_repack(btree_bset_first(dst),
- src, &src_iter,
- &dst->format,
- true);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
- start_time);
-
- set_btree_bset_end(dst, dst->set);
-
- dst->nr.live_u64s += nr.live_u64s;
- dst->nr.bset_u64s[0] += nr.bset_u64s[0];
- dst->nr.packed_keys += nr.packed_keys;
- dst->nr.unpacked_keys += nr.unpacked_keys;
-
- bch2_verify_btree_nr_keys(dst);
-}
-
-/*
- * We're about to add another bset to the btree node, so if there's currently
- * too many bsets - sort some of them together:
- */
-static bool btree_node_compact(struct bch_fs *c, struct btree *b)
-{
- unsigned unwritten_idx;
- bool ret = false;
-
- for (unwritten_idx = 0;
- unwritten_idx < b->nsets;
- unwritten_idx++)
- if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
- break;
-
- if (b->nsets - unwritten_idx > 1) {
- btree_node_sort(c, b, unwritten_idx, b->nsets);
- ret = true;
- }
-
- if (unwritten_idx > 1) {
- btree_node_sort(c, b, 0, unwritten_idx);
- ret = true;
- }
-
- return ret;
-}
-
-void bch2_btree_build_aux_trees(struct btree *b)
-{
- for_each_bset(b, t)
- bch2_bset_build_aux_tree(b, t,
- !bset_written(b, bset(b, t)) &&
- t == bset_tree_last(b));
-}
-
-/*
- * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
- *
- * The first bset is going to be of similar order to the size of the node, the
- * last bset is bounded by btree_write_set_buffer(), which is set to keep the
- * memmove on insert from being too expensive: the middle bset should, ideally,
- * be the geometric mean of the first and the last.
- *
- * Returns true if the middle bset is greater than that geometric mean:
- */
-static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
-{
- unsigned mid_u64s_bits =
- (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
-
- return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
-}
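
/*
 * Worked example with made-up numbers (the real constants depend on the
 * node size and BTREE_WRITE_SET_U64s_BITS): if the first bset holds on
 * the order of 2^16 u64s and the write set buffer holds 2^10, the
 * threshold above is 2^((16 + 10) / 2) = 2^13 u64s - i.e. the geometric
 * mean sqrt(2^16 * 2^10) of the two sizes.
 */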
-
-/*
- * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
- * inserted into
- *
- * Safe to call if there already is an unwritten bset - will only add a new bset
- * if @b doesn't already have one.
- *
- * If sorting invalidated any iterators, they are reinitialized via
- * bch2_trans_node_reinit_iter().
- */
-void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
- struct btree_node_entry *bne;
- bool reinit_iter = false;
-
- EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
- BUG_ON(bset_written(b, bset(b, &b->set[1])));
- BUG_ON(btree_node_just_written(b));
-
- if (b->nsets == MAX_BSETS &&
- !btree_node_write_in_flight(b) &&
- should_compact_all(c, b)) {
- bch2_btree_node_write_trans(trans, b, SIX_LOCK_write,
- BTREE_WRITE_init_next_bset);
- reinit_iter = true;
- }
-
- if (b->nsets == MAX_BSETS &&
- btree_node_compact(c, b))
- reinit_iter = true;
-
- BUG_ON(b->nsets >= MAX_BSETS);
-
- bne = want_new_bset(c, b);
- if (bne)
- bch2_bset_init_next(b, bne);
-
- bch2_btree_build_aux_trees(b);
-
- if (reinit_iter)
- bch2_trans_node_reinit_iter(trans, b);
-}
-
-static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
- struct bch_dev *ca,
- bool print_pos,
- struct btree *b, struct bset *i, struct bkey_packed *k,
- unsigned offset, int rw)
-{
- if (print_pos) {
- prt_str(out, rw == READ
- ? "error validating btree node "
- : "corrupt btree node before write ");
- prt_printf(out, "at btree ");
- bch2_btree_pos_to_text(out, c, b);
- prt_newline(out);
- }
-
- if (ca)
- prt_printf(out, "%s ", ca->name);
-
- prt_printf(out, "node offset %u/%u",
- b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
- if (i)
- prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
- if (k)
- prt_printf(out, " bset byte offset %lu",
- (unsigned long)(void *)k -
- ((unsigned long)(void *)i & ~511UL));
- prt_str(out, ": ");
-}
-
-__printf(11, 12)
-static int __btree_err(int ret,
- struct bch_fs *c,
- struct bch_dev *ca,
- struct btree *b,
- struct bset *i,
- struct bkey_packed *k,
- int rw,
- enum bch_sb_error_id err_type,
- struct bch_io_failures *failed,
- struct printbuf *err_msg,
- const char *fmt, ...)
-{
- if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
- return ret == -BCH_ERR_btree_node_read_err_fixable
- ? bch_err_throw(c, fsck_fix)
- : ret;
-
- bool have_retry = false;
- int ret2;
-
- if (ca) {
- bch2_mark_btree_validate_failure(failed, ca->dev_idx);
-
- struct extent_ptr_decoded pick;
- have_retry = bch2_bkey_pick_read_device(c,
- bkey_i_to_s_c(&b->key),
- failed, &pick, -1) == 1;
- }
-
- if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
- ret = bch_err_throw(c, btree_node_read_err_fixable);
- if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
- ret = bch_err_throw(c, btree_node_read_err_bad_node);
-
- bch2_sb_error_count(c, err_type);
-
- bool print_deferred = err_msg &&
- rw == READ &&
- !(test_bit(BCH_FS_in_fsck, &c->flags) &&
- c->opts.fix_errors == FSCK_FIX_ask);
-
- struct printbuf out = PRINTBUF;
- bch2_log_msg_start(c, &out);
-
- if (!print_deferred)
- err_msg = &out;
-
- btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw);
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(err_msg, fmt, args);
- va_end(args);
-
- if (print_deferred) {
- prt_newline(err_msg);
-
- switch (ret) {
- case -BCH_ERR_btree_node_read_err_fixable:
- ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
- if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
- !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
- ret = ret2;
- goto fsck_err;
- }
-
- if (!have_retry)
- ret = bch_err_throw(c, fsck_fix);
- goto out;
- case -BCH_ERR_btree_node_read_err_bad_node:
- prt_str(&out, ", ");
- break;
- }
-
- goto out;
- }
-
- if (rw == WRITE) {
- prt_str(&out, ", ");
- ret = __bch2_inconsistent_error(c, &out)
- ? -BCH_ERR_fsck_errors_not_fixed
- : 0;
- goto print;
- }
-
- switch (ret) {
- case -BCH_ERR_btree_node_read_err_fixable:
- ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
- if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
- !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
- ret = ret2;
- goto fsck_err;
- }
-
- if (!have_retry)
- ret = bch_err_throw(c, fsck_fix);
- goto out;
- case -BCH_ERR_btree_node_read_err_bad_node:
- prt_str(&out, ", ");
- break;
- }
-print:
- bch2_print_str(c, KERN_ERR, out.buf);
-out:
-fsck_err:
- printbuf_exit(&out);
- return ret;
-}
-
-#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
-({ \
- int _ret = __btree_err(type, c, ca, b, i, k, write, \
- BCH_FSCK_ERR_##_err_type, \
- failed, err_msg, \
- msg, ##__VA_ARGS__); \
- \
- if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \
- ret = _ret; \
- goto fsck_err; \
- } \
- \
- true; \
-})
-
-#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
-
-/*
- * When btree topology repair changes the start or end of a node, that might
- * mean we have to drop keys that are no longer inside the node:
- */
-__cold
-void bch2_btree_node_drop_keys_outside_node(struct btree *b)
-{
- for_each_bset(b, t) {
- struct bset *i = bset(b, t);
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
- if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
- break;
-
- if (k != i->start) {
- unsigned shift = (u64 *) k - (u64 *) i->start;
-
- memmove_u64s_down(i->start, k,
- (u64 *) vstruct_end(i) - (u64 *) k);
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
- set_btree_bset_end(b, t);
- }
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
- if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
- break;
-
- if (k != vstruct_last(i)) {
- i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
- set_btree_bset_end(b, t);
- }
- }
-
- /*
- * Always rebuild search trees: eytzinger search tree nodes directly
- * depend on the values of min/max key:
- */
- bch2_bset_set_no_aux_tree(b, b->set);
- bch2_btree_build_aux_trees(b);
- b->nr = bch2_btree_node_count_keys(b);
-
- struct bkey_s_c k;
- struct bkey unpacked;
- struct btree_node_iter iter;
- for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
- BUG_ON(bpos_lt(k.k->p, b->data->min_key));
- BUG_ON(bpos_gt(k.k->p, b->data->max_key));
- }
-}
-
-static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
- struct btree *b, struct bset *i,
- unsigned offset, int write,
- struct bch_io_failures *failed,
- struct printbuf *err_msg)
-{
- unsigned version = le16_to_cpu(i->version);
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
- int ret = 0;
-
- btree_err_on(!bch2_version_compatible(version),
- -BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i, NULL,
- btree_node_unsupported_version,
- "unsupported bset version %u.%u",
- BCH_VERSION_MAJOR(version),
- BCH_VERSION_MINOR(version));
-
- if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes &&
- btree_err_on(version < c->sb.version_min,
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, NULL,
- btree_node_bset_older_than_sb_min,
- "bset version %u older than superblock version_min %u",
- version, c->sb.version_min)) {
- if (bch2_version_compatible(version)) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->version_min = cpu_to_le16(version);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- } else {
- /* We have no idea what's going on: */
- i->version = cpu_to_le16(c->sb.version);
- }
- }
-
- if (btree_err_on(BCH_VERSION_MAJOR(version) >
- BCH_VERSION_MAJOR(c->sb.version),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, NULL,
- btree_node_bset_newer_than_sb,
- "bset version %u newer than superblock version %u",
- version, c->sb.version)) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->version = cpu_to_le16(version);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
- -BCH_ERR_btree_node_read_err_incompatible,
- c, ca, b, i, NULL,
- btree_node_unsupported_version,
- "BSET_SEPARATE_WHITEOUTS no longer supported");
-
- btree_err_on(offset && !i->u64s,
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_empty,
- "empty bset");
-
- btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_wrong_sector_offset,
- "bset at wrong sector offset");
-
- if (!offset) {
- struct btree_node *bn =
- container_of(i, struct btree_node, keys);
- /* These indicate that we read the wrong btree node: */
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bch_btree_ptr_v2 *bp =
- &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
- /* XXX endianness */
- btree_err_on(bp->seq != bn->keys.seq,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- bset_bad_seq,
- "incorrect sequence number (wrong btree node)");
- }
-
- btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i, NULL,
- btree_node_bad_btree,
- "incorrect btree id");
-
- btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i, NULL,
- btree_node_bad_level,
- "incorrect level");
-
- if (!write)
- compat_btree_node(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write, bn);
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bch_btree_ptr_v2 *bp =
- &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
- if (BTREE_PTR_RANGE_UPDATED(bp)) {
- b->data->min_key = bp->min_key;
- b->data->max_key = b->key.k.p;
- }
-
- btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_min_key,
- "incorrect min_key: got %s should be %s",
- (printbuf_reset(&buf1),
- bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
- (printbuf_reset(&buf2),
- bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
- }
-
- btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, i, NULL,
- btree_node_bad_max_key,
- "incorrect max key %s",
- (printbuf_reset(&buf1),
- bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
-
- if (write)
- compat_btree_node(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write, bn);
-
- btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
- -BCH_ERR_btree_node_read_err_bad_node,
- c, ca, b, i, NULL,
- btree_node_bad_format,
- "invalid bkey format: %s\n%s", buf1.buf,
- (printbuf_reset(&buf2),
- bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
- printbuf_reset(&buf1);
-
- compat_bformat(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write,
- &bn->format);
- }
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
-static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- enum bch_validate_flags flags)
-{
- return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = flags
- });
-}
-
-static int bset_key_validate(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- bool updated_range,
- enum bch_validate_flags flags)
-{
- struct bkey_validate_context from = (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = flags,
- };
- return __bch2_bkey_validate(c, k, from) ?:
- (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?:
- (flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0);
-}
-
-static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
- struct bset *i, struct bkey_packed *k)
-{
- if (bkey_p_next(k) > vstruct_last(i))
- return false;
-
- if (k->format > KEY_FORMAT_CURRENT)
- return false;
-
- if (!bkeyp_u64s_valid(&b->format, k))
- return false;
-
- struct bkey tmp;
- struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- return !__bch2_bkey_validate(c, u.s_c,
- (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = BCH_VALIDATE_silent
- });
-}
-
-static inline int btree_node_read_bkey_cmp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r)
-{
- return bch2_bkey_cmp_packed(b, l, r)
- ?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
-}
-
-static int validate_bset_keys(struct bch_fs *c, struct btree *b,
- struct bset *i, int write,
- struct bch_io_failures *failed,
- struct printbuf *err_msg)
-{
- unsigned version = le16_to_cpu(i->version);
- struct bkey_packed *k, *prev = NULL;
- struct printbuf buf = PRINTBUF;
- bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
- int ret = 0;
-
- for (k = i->start;
- k != vstruct_last(i);) {
- struct bkey_s u;
- struct bkey tmp;
- unsigned next_good_key;
-
- if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_past_bset_end,
- "key extends past end of bset")) {
- i->u64s = cpu_to_le16((u64 *) k - i->_data);
- break;
- }
-
- if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_bad_format,
- "invalid bkey format %u", k->format))
- goto drop_this_key;
-
- if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_bad_u64s,
- "bad k->u64s %u (min %u max %zu)", k->u64s,
- bkeyp_key_u64s(&b->format, k),
- U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
- goto drop_this_key;
-
- if (!write)
- bch2_bkey_compat(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write,
- &b->format, k);
-
- u = __bkey_disassemble(b, k, &tmp);
-
- ret = bset_key_validate(c, b, u.s_c, updated_range, write);
- if (ret == -BCH_ERR_fsck_delete_bkey)
- goto drop_this_key;
- if (ret)
- goto fsck_err;
-
- if (write)
- bch2_bkey_compat(b->c.level, b->c.btree_id, version,
- BSET_BIG_ENDIAN(i), write,
- &b->format, k);
-
- if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) {
- struct bkey up = bkey_unpack_key(b, prev);
-
- printbuf_reset(&buf);
- prt_printf(&buf, "keys out of order: ");
- bch2_bkey_to_text(&buf, &up);
- prt_printf(&buf, " > ");
- bch2_bkey_to_text(&buf, u.k);
-
- if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bkey_out_of_order,
- "%s", buf.buf))
- goto drop_this_key;
- }
-
- prev = k;
- k = bkey_p_next(k);
- continue;
-drop_this_key:
- next_good_key = k->u64s;
-
- if (!next_good_key ||
- (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
- version >= bcachefs_metadata_version_snapshot)) {
- /*
- * only do scanning if bch2_bkey_compat() has nothing to
- * do
- */
-
- if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
- for (next_good_key = 1;
- next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
- next_good_key++)
- if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
- goto got_good_key;
- }
-
- /*
- * didn't find a good key, have to truncate the rest of
- * the bset
- */
- next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
- }
-got_good_key:
- le16_add_cpu(&i->u64s, -next_good_key);
- memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_error(b);
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
- struct btree *b,
- struct bch_io_failures *failed,
- struct printbuf *err_msg)
-{
- struct btree_node_entry *bne;
- struct sort_iter *iter;
- struct btree_node *sorted;
- struct bkey_packed *k;
- struct bset *i;
- bool used_mempool, blacklisted;
- bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
- unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
- u64 max_journal_seq = 0;
- struct printbuf buf = PRINTBUF;
- int ret = 0, write = READ;
- u64 start_time = local_clock();
-
- b->version_ondisk = U16_MAX;
- /* We might get called multiple times on read retry: */
- b->written = 0;
-
- iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
- sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
-
- if (bch2_meta_read_fault("btree"))
- btree_err(-BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_fault_injected,
- "dynamic fault");
-
- btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_magic,
- "bad magic: want %llx, got %llx",
- bset_magic(c), le64_to_cpu(b->data->magic));
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bch_btree_ptr_v2 *bp =
- &bkey_i_to_btree_ptr_v2(&b->key)->v;
-
- bch2_bpos_to_text(&buf, b->data->min_key);
- prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, b->data->max_key);
-
- btree_err_on(b->data->keys.seq != bp->seq,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_seq,
- "got wrong btree node: got\n%s",
- (printbuf_reset(&buf),
- bch2_btree_node_header_to_text(&buf, b->data),
- buf.buf));
- } else {
- btree_err_on(!b->data->keys.seq,
- -BCH_ERR_btree_node_read_err_must_retry,
- c, ca, b, NULL, NULL,
- btree_node_bad_seq,
- "bad btree header: seq 0\n%s",
- (printbuf_reset(&buf),
- bch2_btree_node_header_to_text(&buf, b->data),
- buf.buf));
- }
-
- while (b->written < (ptr_written ?: btree_sectors(c))) {
- unsigned sectors;
- bool first = !b->written;
-
- if (first) {
- bne = NULL;
- i = &b->data->keys;
- } else {
- bne = write_block(b);
- i = &bne->keys;
-
- if (i->seq != b->data->keys.seq)
- break;
- }
-
- struct nonce nonce = btree_nonce(i, b->written << 9);
- bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
-
- btree_err_on(!good_csum_type,
- bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))
- ? -BCH_ERR_btree_node_read_err_must_retry
- : -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_unknown_csum,
- "unknown checksum type %llu", BSET_CSUM_TYPE(i));
-
- if (first) {
- sectors = vstruct_sectors(b->data, c->block_bits);
- if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_past_end_of_btree_node,
- "bset past end of btree node (offset %u len %u but written %zu)",
- b->written, sectors, ptr_written ?: btree_sectors(c)))
- i->u64s = 0;
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
- bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
- if (csum_bad)
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
- btree_err_on(csum_bad,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_bad_csum,
- "%s",
- (printbuf_reset(&buf),
- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
- buf.buf));
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "decrypting btree node: %s", bch2_err_str(ret)))
- goto fsck_err;
- }
-
- btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
- -BCH_ERR_btree_node_read_err_incompatible,
- c, NULL, b, NULL, NULL,
- btree_node_unsupported_version,
- "btree node does not have NEW_EXTENT_OVERWRITE set");
- } else {
- sectors = vstruct_sectors(bne, c->block_bits);
- if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_past_end_of_btree_node,
- "bset past end of btree node (offset %u len %u but written %zu)",
- b->written, sectors, ptr_written ?: btree_sectors(c)))
- i->u64s = 0;
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
- bool csum_bad = bch2_crc_cmp(bne->csum, csum);
- if (ca && csum_bad)
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
-
- btree_err_on(csum_bad,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, i, NULL,
- bset_bad_csum,
- "%s",
- (printbuf_reset(&buf),
- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
- buf.buf));
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "decrypting btree node: %s", bch2_err_str(ret)))
- goto fsck_err;
- }
- }
-
- b->version_ondisk = min(b->version_ondisk,
- le16_to_cpu(i->version));
-
- ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg);
- if (ret)
- goto fsck_err;
-
- if (!b->written)
- btree_node_set_format(b, b->data->format);
-
- ret = validate_bset_keys(c, b, i, READ, failed, err_msg);
- if (ret)
- goto fsck_err;
-
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- blacklisted = bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(i->journal_seq),
- true);
-
- btree_err_on(blacklisted && first,
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_blacklisted_journal_seq,
- "first btree node bset has blacklisted journal seq (%llu)",
- le64_to_cpu(i->journal_seq));
-
- btree_err_on(blacklisted && ptr_written,
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- first_bset_blacklisted_journal_seq,
- "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
- le64_to_cpu(i->journal_seq),
- b->written, b->written + sectors, ptr_written);
-
- b->written = min(b->written + sectors, btree_sectors(c));
-
- if (blacklisted && !first)
- continue;
-
- sort_iter_add(iter,
- vstruct_idx(i, 0),
- vstruct_last(i));
-
- max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
- }
-
- if (ptr_written) {
- btree_err_on(b->written < ptr_written,
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL, NULL,
- btree_node_data_missing,
- "btree node data missing: expected %u sectors, found %u",
- ptr_written, b->written);
- } else {
- for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_buf_bytes(b);
- bne = (void *) bne + block_bytes(c))
- btree_err_on(bne->keys.seq == b->data->keys.seq &&
- !bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(bne->keys.journal_seq),
- true),
- -BCH_ERR_btree_node_read_err_want_retry,
- c, ca, b, NULL, NULL,
- btree_node_bset_after_end,
- "found bset signature after last bset");
- }
-
- sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
- sorted->keys.u64s = 0;
-
- b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
- memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
- btree_buf_bytes(b) -
- sizeof(struct btree_node) -
- b->nr.live_u64s * sizeof(u64));
-
- b->data->keys.u64s = sorted->keys.u64s;
- *sorted = *b->data;
- swap(sorted, b->data);
- set_btree_bset(b, b->set, &b->data->keys);
- b->nsets = 1;
- b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
-
- BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));
-
- btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
-
- i = &b->data->keys;
- for (k = i->start; k != vstruct_last(i);) {
- struct bkey tmp;
- struct bkey_s u = __bkey_disassemble(b, k, &tmp);
-
- ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
- if (ret == -BCH_ERR_fsck_delete_bkey ||
- (static_branch_unlikely(&bch2_inject_invalid_keys) &&
- !bversion_cmp(u.k->bversion, MAX_VERSION))) {
- btree_keys_account_key_drop(&b->nr, 0, k);
-
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- set_btree_bset_end(b, b->set);
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_error(b);
- continue;
- }
- if (ret)
- goto fsck_err;
-
- if (u.k->type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
-
- bp.v->mem_ptr = 0;
- }
-
- k = bkey_p_next(k);
- }
-
- bch2_bset_build_aux_tree(b, b->set, false);
-
- set_needs_whiteout(btree_bset_first(b), true);
-
- btree_node_reset_sib_u64s(b);
-
- if (updated_range)
- bch2_btree_node_drop_keys_outside_node(b);
-
- /*
- * XXX:
- *
- * We deadlock if too many btree updates require node rewrites while
- * we're still in journal replay.
- *
- * This is because btree node rewrites generate more updates for the
- * interior updates (alloc, backpointers), and if those updates touch
- * new nodes and generate more rewrites - well, you see the problem.
- *
-	 * The biggest cause is that we don't use the btree write buffer (for
-	 * the backpointer updates) - this needs some real thought on locking in
-	 * order to fix.
- *
- * The problem with this workaround (not doing the rewrite for degraded
- * nodes in journal replay) is that those degraded nodes persist, and we
- * don't want that (this is a real bug when a btree node write completes
- * with fewer replicas than we wanted and leaves a degraded node due to
- * device _removal_, i.e. the device went away mid write).
- *
-	 * It's less of a bug here, but still a problem because we don't yet
-	 * have a way of tracking degraded data - we need another index (all
-	 * extents/btree nodes, by replicas entry) in order to fix properly
-	 * (re-replicate degraded data at the earliest possible time).
- */
- if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
- scoped_guard(rcu)
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_degraded(b);
- }
- }
- }
-
- if (!ptr_written) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_ptr_written_zero(b);
- }
-fsck_err:
- mempool_free(iter, &c->fill_iter);
- printbuf_exit(&buf);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
- return ret;
-}
-
-static void btree_node_read_work(struct work_struct *work)
-{
- struct btree_read_bio *rb =
- container_of(work, struct btree_read_bio, work);
- struct bch_fs *c = rb->c;
- struct bch_dev *ca = rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
- struct btree *b = rb->b;
- struct bio *bio = &rb->bio;
- struct bch_io_failures failed = { .nr = 0 };
- int ret = 0;
-
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "btree node read error at btree ");
- bch2_btree_pos_to_text(&buf, c, b);
- prt_newline(&buf);
-
- goto start;
- while (1) {
- ret = bch2_bkey_pick_read_device(c,
- bkey_i_to_s_c(&b->key),
- &failed, &rb->pick, -1);
- if (ret <= 0) {
- set_btree_node_read_error(b);
- break;
- }
-
- ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
- rb->have_ioref = ca != NULL;
- rb->start_time = local_clock();
- bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = rb->pick.ptr.offset;
- bio->bi_iter.bi_size = btree_buf_bytes(b);
-
- if (rb->have_ioref) {
- bio_set_dev(bio, ca->disk_sb.bdev);
- submit_bio_wait(bio);
- } else {
- bio->bi_status = BLK_STS_REMOVED;
- }
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- rb->start_time, !bio->bi_status);
-start:
- if (rb->have_ioref)
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
- rb->have_ioref = false;
-
- if (bio->bi_status) {
- bch2_mark_io_failure(&failed, &rb->pick, false);
- continue;
- }
-
- ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
- if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry)
- continue;
-
- if (ret)
- set_btree_node_read_error(b);
-
- break;
- }
-
- bch2_io_failures_to_text(&buf, c, &failed);
-
- if (btree_node_read_error(b))
- bch2_btree_lost_data(c, &buf, b->c.btree_id);
-
- /*
- * only print retry success if we read from a replica with no errors
- */
- if (btree_node_read_error(b))
- prt_printf(&buf, "ret %s", bch2_err_str(ret));
- else if (failed.nr) {
- if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
- prt_printf(&buf, "retry success");
- else
- prt_printf(&buf, "repair success");
- }
-
- if ((failed.nr ||
- btree_node_need_rewrite(b)) &&
- !btree_node_read_error(b) &&
- c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
- prt_printf(&buf, " (rewriting node)");
- bch2_btree_node_rewrite_async(c, b);
- }
- prt_newline(&buf);
-
- if (failed.nr)
- bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
-
- async_object_list_del(c, btree_read_bio, rb->list_idx);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
- rb->start_time);
- bio_put(&rb->bio);
- printbuf_exit(&buf);
- clear_btree_node_read_in_flight(b);
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_endio(struct bio *bio)
-{
- struct btree_read_bio *rb =
- container_of(bio, struct btree_read_bio, bio);
- struct bch_fs *c = rb->c;
- struct bch_dev *ca = rb->have_ioref
- ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- rb->start_time, !bio->bi_status);
-
- queue_work(c->btree_read_complete_wq, &rb->work);
-}
-
-void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
-{
- bch2_bio_to_text(out, &rbio->bio);
-}
-
-struct btree_node_read_all {
- struct closure cl;
- struct bch_fs *c;
- struct btree *b;
- unsigned nr;
- void *buf[BCH_REPLICAS_MAX];
- struct bio *bio[BCH_REPLICAS_MAX];
- blk_status_t err[BCH_REPLICAS_MAX];
-};
-
-static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
-{
- struct btree_node *bn = data;
- struct btree_node_entry *bne;
- unsigned offset = 0;
-
- if (le64_to_cpu(bn->magic) != bset_magic(c))
- return 0;
-
- while (offset < btree_sectors(c)) {
- if (!offset) {
- offset += vstruct_sectors(bn, c->block_bits);
- } else {
- bne = data + (offset << 9);
- if (bne->keys.seq != bn->keys.seq)
- break;
- offset += vstruct_sectors(bne, c->block_bits);
- }
- }
-
- return offset;
-}
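
/*
 * What the walk above relies on: the first bset lives in the btree_node
 * header itself; each later bset is a btree_node_entry starting at the
 * next block-aligned offset, and belongs to this node iff its keys.seq
 * matches the header's seq. Summing vstruct_sectors() over those entries
 * gives the number of sectors actually written.
 */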
-
-static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
-{
- struct btree_node *bn = data;
- struct btree_node_entry *bne;
-
- if (!offset)
- return false;
-
- while (offset < btree_sectors(c)) {
- bne = data + (offset << 9);
- if (bne->keys.seq == bn->keys.seq)
- return true;
- offset++;
- }
-
-	return false;
-}
-
-static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
-{
- closure_type(ra, struct btree_node_read_all, cl);
- struct bch_fs *c = ra->c;
- struct btree *b = ra->b;
- struct printbuf buf = PRINTBUF;
- bool dump_bset_maps = false;
- int ret = 0, best = -1, write = READ;
- unsigned i, written = 0, written2 = 0;
- __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
- ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
- bool _saw_error = false, *saw_error = &_saw_error;
- struct printbuf *err_msg = NULL;
- struct bch_io_failures *failed = NULL;
-
- for (i = 0; i < ra->nr; i++) {
- struct btree_node *bn = ra->buf[i];
-
- if (ra->err[i])
- continue;
-
- if (le64_to_cpu(bn->magic) != bset_magic(c) ||
- (seq && seq != bn->keys.seq))
- continue;
-
- if (best < 0) {
- best = i;
- written = btree_node_sectors_written(c, bn);
- continue;
- }
-
- written2 = btree_node_sectors_written(c, ra->buf[i]);
- if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL, NULL,
- btree_node_replicas_sectors_written_mismatch,
- "btree node sectors written mismatch: %u != %u",
- written, written2) ||
- btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL, NULL,
- btree_node_bset_after_end,
- "found bset signature after last bset") ||
- btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
- -BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, NULL, NULL,
- btree_node_replicas_data_mismatch,
- "btree node replicas content mismatch"))
- dump_bset_maps = true;
-
- if (written2 > written) {
- written = written2;
- best = i;
- }
- }
-fsck_err:
- if (dump_bset_maps) {
- for (i = 0; i < ra->nr; i++) {
- struct btree_node *bn = ra->buf[i];
- struct btree_node_entry *bne = NULL;
- unsigned offset = 0, sectors;
- bool gap = false;
-
- if (ra->err[i])
- continue;
-
- printbuf_reset(&buf);
-
- while (offset < btree_sectors(c)) {
- if (!offset) {
- sectors = vstruct_sectors(bn, c->block_bits);
- } else {
- bne = ra->buf[i] + (offset << 9);
- if (bne->keys.seq != bn->keys.seq)
- break;
- sectors = vstruct_sectors(bne, c->block_bits);
- }
-
- prt_printf(&buf, " %u-%u", offset, offset + sectors);
- if (bne && bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(bne->keys.journal_seq), false))
- prt_printf(&buf, "*");
- offset += sectors;
- }
-
- while (offset < btree_sectors(c)) {
- bne = ra->buf[i] + (offset << 9);
- if (bne->keys.seq == bn->keys.seq) {
- if (!gap)
- prt_printf(&buf, " GAP");
- gap = true;
-
- sectors = vstruct_sectors(bne, c->block_bits);
- prt_printf(&buf, " %u-%u", offset, offset + sectors);
- if (bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(bne->keys.journal_seq), false))
- prt_printf(&buf, "*");
- }
- offset++;
- }
-
- bch_err(c, "replica %u:%s", i, buf.buf);
- }
- }
-
- if (best >= 0) {
- memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
- ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL);
- } else {
- ret = -1;
- }
-
- if (ret) {
- set_btree_node_read_error(b);
-
- struct printbuf buf = PRINTBUF;
- bch2_btree_lost_data(c, &buf, b->c.btree_id);
- if (buf.pos)
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- } else if (*saw_error)
- bch2_btree_node_rewrite_async(c, b);
-
- for (i = 0; i < ra->nr; i++) {
- mempool_free(ra->buf[i], &c->btree_bounce_pool);
- bio_put(ra->bio[i]);
- }
-
- closure_debug_destroy(&ra->cl);
- kfree(ra);
- printbuf_exit(&buf);
-
- clear_btree_node_read_in_flight(b);
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-}
-
-static void btree_node_read_all_replicas_endio(struct bio *bio)
-{
- struct btree_read_bio *rb =
- container_of(bio, struct btree_read_bio, bio);
- struct bch_fs *c = rb->c;
- struct btree_node_read_all *ra = rb->ra;
-
- if (rb->have_ioref) {
- struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
- bch2_latency_acct(ca, rb->start_time, READ);
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_btree_node_read_all_replicas);
- }
-
- ra->err[rb->idx] = bio->bi_status;
- closure_put(&ra->cl);
-}
-
-/*
- * XXX This allocates multiple times from the same mempools, and can deadlock
- * under sufficient memory pressure (but is only a debug path)
- */
-static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
-{
- struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded pick;
- struct btree_node_read_all *ra;
- unsigned i;
-
- ra = kzalloc(sizeof(*ra), GFP_NOFS);
- if (!ra)
- return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas);
-
- closure_init(&ra->cl, NULL);
- ra->c = c;
- ra->b = b;
- ra->nr = bch2_bkey_nr_ptrs(k);
-
- for (i = 0; i < ra->nr; i++) {
- ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
- ra->bio[i] = bio_alloc_bioset(NULL,
- buf_pages(ra->buf[i], btree_buf_bytes(b)),
- REQ_OP_READ|REQ_SYNC|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- }
-
- i = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
- BCH_DEV_READ_REF_btree_node_read_all_replicas);
- struct btree_read_bio *rb =
- container_of(ra->bio[i], struct btree_read_bio, bio);
- rb->c = c;
- rb->b = b;
- rb->ra = ra;
- rb->start_time = local_clock();
- rb->have_ioref = ca != NULL;
- rb->idx = i;
- rb->pick = pick;
- rb->bio.bi_iter.bi_sector = pick.ptr.offset;
- rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
- bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
-
- if (rb->have_ioref) {
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
- bio_sectors(&rb->bio));
- bio_set_dev(&rb->bio, ca->disk_sb.bdev);
-
- closure_get(&ra->cl);
- submit_bio(&rb->bio);
- } else {
- ra->err[i] = BLK_STS_REMOVED;
- }
-
- i++;
- }
-
- if (sync) {
- closure_sync(&ra->cl);
- btree_node_read_all_replicas_done(&ra->cl.work);
- } else {
- continue_at(&ra->cl, btree_node_read_all_replicas_done,
- c->btree_read_complete_wq);
- }
-
- return 0;
-}
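
/*
 * Elaborating on the XXX above: each replica's bounce buffer and bio come
 * from shared pools, and mempool_alloc(GFP_NOFS) may sleep waiting for an
 * element to be returned. Two readers each holding part of a pool while
 * waiting for the rest can deadlock under memory pressure - tolerable
 * only because this path is gated behind the verify_all_btree_replicas
 * debug option.
 */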
-
-void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
- bool sync)
-{
- struct bch_fs *c = trans->c;
- struct extent_ptr_decoded pick;
- struct btree_read_bio *rb;
- struct bch_dev *ca;
- struct bio *bio;
- int ret;
-
- trace_and_count(c, btree_node_read, trans, b);
-
- if (static_branch_unlikely(&bch2_verify_all_btree_replicas) &&
- !btree_node_read_all_replicas(c, b, sync))
- return;
-
- ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
- NULL, &pick, -1);
-
- if (ret <= 0) {
- bool ratelimit = true;
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_str(&buf, "btree node read error: no device to read from\n at ");
- bch2_btree_pos_to_text(&buf, c, b);
- prt_newline(&buf);
- bch2_btree_lost_data(c, &buf, b->c.btree_id);
-
- if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
- bch2_fs_emergency_read_only2(c, &buf))
- ratelimit = false;
-
- static DEFINE_RATELIMIT_STATE(rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
- if (!ratelimit || __ratelimit(&rs))
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- set_btree_node_read_error(b);
- clear_btree_node_read_in_flight(b);
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
- return;
- }
-
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
-
- bio = bio_alloc_bioset(NULL,
- buf_pages(b->data, btree_buf_bytes(b)),
- REQ_OP_READ|REQ_SYNC|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- rb = container_of(bio, struct btree_read_bio, bio);
- rb->c = c;
- rb->b = b;
- rb->ra = NULL;
- rb->start_time = local_clock();
- rb->have_ioref = ca != NULL;
- rb->pick = pick;
- INIT_WORK(&rb->work, btree_node_read_work);
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bio->bi_end_io = btree_node_read_endio;
- bch2_bio_map(bio, b->data, btree_buf_bytes(b));
-
- async_object_list_add(c, btree_read_bio, rb, &rb->list_idx);
-
- if (rb->have_ioref) {
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
- bio_sectors(bio));
- bio_set_dev(bio, ca->disk_sb.bdev);
-
- if (sync) {
- submit_bio_wait(bio);
- bch2_latency_acct(ca, rb->start_time, READ);
- btree_node_read_work(&rb->work);
- } else {
- submit_bio(bio);
- }
- } else {
- bio->bi_status = BLK_STS_REMOVED;
-
- if (sync)
- btree_node_read_work(&rb->work);
- else
- queue_work(c->btree_read_complete_wq, &rb->work);
- }
-}
-
-static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
- const struct bkey_i *k, unsigned level)
-{
- struct bch_fs *c = trans->c;
- struct closure cl;
- struct btree *b;
- int ret;
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- closure_sync(&cl);
- } while (ret);
-
- b = bch2_btree_node_mem_alloc(trans, level != 0);
- bch2_btree_cache_cannibalize_unlock(trans);
-
- BUG_ON(IS_ERR(b));
-
- bkey_copy(&b->key, k);
- BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
-
- set_btree_node_read_in_flight(b);
-
- /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
- bch2_trans_unlock(trans);
- bch2_btree_node_read(trans, b, true);
-
- if (btree_node_read_error(b)) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- ret = bch_err_throw(c, btree_node_read_error);
- goto err;
- }
-
- bch2_btree_set_root_for_read(c, b);
-err:
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- return ret;
-}
-
-int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
- const struct bkey_i *k, unsigned level)
-{
- return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
-}
-
-struct btree_node_scrub {
- struct bch_fs *c;
- struct bch_dev *ca;
- void *buf;
- bool used_mempool;
- unsigned written;
-
- enum btree_id btree;
- unsigned level;
- struct bkey_buf key;
- __le64 seq;
-
- struct work_struct work;
- struct bio bio;
-};
-
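-/*
- * Verify a btree node buffer read by scrub: check the magic number, then
- * walk each bset and verify its checksum, stopping after ptr_written sectors
- * (or at the first bset with a mismatched sequence number if ptr_written is
- * zero).
- */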
-static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
- struct printbuf *err)
-{
- unsigned written = 0;
-
- if (le64_to_cpu(data->magic) != bset_magic(c)) {
- prt_printf(err, "bad magic: want %llx, got %llx",
- bset_magic(c), le64_to_cpu(data->magic));
- return false;
- }
-
- while (written < (ptr_written ?: btree_sectors(c))) {
- struct btree_node_entry *bne;
- struct bset *i;
- bool first = !written;
-
- if (first) {
- bne = NULL;
- i = &data->keys;
- } else {
- bne = (void *) data + (written << 9);
- i = &bne->keys;
-
- if (!ptr_written && i->seq != data->keys.seq)
- break;
- }
-
- struct nonce nonce = btree_nonce(i, written << 9);
- bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
-
- if (first) {
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
- if (bch2_crc_cmp(data->csum, csum)) {
- bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
- return false;
- }
- }
-
- written += vstruct_sectors(data, c->block_bits);
- } else {
- if (good_csum_type) {
- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
- if (bch2_crc_cmp(bne->csum, csum)) {
- bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
- return false;
- }
- }
-
- written += vstruct_sectors(bne, c->block_bits);
- }
- }
-
- return true;
-}
-
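-/*
- * Scrub completion, run from process context: if the data we read fails
- * verification, rewrite the node so that a good copy gets written out, then
- * release the buffers and refs taken by bch2_btree_node_scrub().
- */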
-static void btree_node_scrub_work(struct work_struct *work)
-{
- struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
- struct bch_fs *c = scrub->c;
- struct printbuf err = PRINTBUF;
-
- __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
- bkey_i_to_s_c(scrub->key.k));
- prt_newline(&err);
-
- if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
- int ret = bch2_trans_do(c,
- bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
- scrub->key.k, 0));
- if (!bch2_err_matches(ret, ENOENT) &&
- !bch2_err_matches(ret, EROFS))
- bch_err_fn_ratelimited(c, ret);
- }
-
- printbuf_exit(&err);
-	bch2_bkey_buf_exit(&scrub->key, c);

- btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
- enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
- kfree(scrub);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
-}
-
-static void btree_node_scrub_endio(struct bio *bio)
-{
- struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
-
- queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
-}
-
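-/*
- * Kick off an asynchronous scrub read of one replica of a btree node: the
- * replica is read into a bounce buffer and verified (and the node rewritten
- * if necessary) from btree_node_scrub_work().
- */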
-int bch2_btree_node_scrub(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c k, unsigned dev)
-{
- if (k.k->type != KEY_TYPE_btree_ptr_v2)
- return 0;
-
- struct bch_fs *c = trans->c;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
- return bch_err_throw(c, erofs_no_writes);
-
- struct extent_ptr_decoded pick;
- int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
- if (ret <= 0)
- goto err;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
- BCH_DEV_READ_REF_btree_node_scrub);
- if (!ca) {
- ret = bch_err_throw(c, device_offline);
- goto err;
- }
-
- bool used_mempool = false;
- void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
-
- unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
-
- struct btree_node_scrub *scrub =
- kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
- if (!scrub) {
- ret = -ENOMEM;
- goto err_free;
- }
-
- scrub->c = c;
- scrub->ca = ca;
- scrub->buf = buf;
- scrub->used_mempool = used_mempool;
- scrub->written = btree_ptr_sectors_written(k);
-
- scrub->btree = btree;
- scrub->level = level;
- bch2_bkey_buf_init(&scrub->key);
- bch2_bkey_buf_reassemble(&scrub->key, c, k);
- scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;
-
- INIT_WORK(&scrub->work, btree_node_scrub_work);
-
- bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
- bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
- scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
- scrub->bio.bi_end_io = btree_node_scrub_endio;
- submit_bio(&scrub->bio);
- return 0;
-err_free:
- btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
-err:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
- return ret;
-}
-
-static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
- struct btree_write *w)
-{
- unsigned long old, new;
-
- old = READ_ONCE(b->will_make_reachable);
- do {
- new = old;
- if (!(old & 1))
- break;
-
- new &= ~1UL;
- } while (!try_cmpxchg(&b->will_make_reachable, &old, new));
-
- if (old & 1)
- closure_put(&((struct btree_update *) new)->cl);
-
- bch2_journal_pin_drop(&c->journal, &w->journal);
-}
-
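-/*
- * Final completion for a btree node write: drop the journal pin, then either
- * clear the write_in_flight flags and wake waiters, or, if the node was
- * redirtied and marked need_write while the write was in flight, immediately
- * start the next write.
- */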
-static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
-{
- struct btree_write *w = btree_prev_write(b);
- unsigned long old, new;
- unsigned type = 0;
-
- bch2_btree_complete_write(c, b, w);
-
- if (start_time)
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
-
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- if ((old & (1U << BTREE_NODE_dirty)) &&
- (old & (1U << BTREE_NODE_need_write)) &&
- !(old & (1U << BTREE_NODE_never_write)) &&
- !(old & (1U << BTREE_NODE_write_blocked)) &&
- !(old & (1U << BTREE_NODE_will_make_reachable))) {
- new &= ~(1U << BTREE_NODE_dirty);
- new &= ~(1U << BTREE_NODE_need_write);
- new |= (1U << BTREE_NODE_write_in_flight);
- new |= (1U << BTREE_NODE_write_in_flight_inner);
- new |= (1U << BTREE_NODE_just_written);
- new ^= (1U << BTREE_NODE_write_idx);
-
- type = new & BTREE_WRITE_TYPE_MASK;
- new &= ~BTREE_WRITE_TYPE_MASK;
- } else {
- new &= ~(1U << BTREE_NODE_write_in_flight);
- new &= ~(1U << BTREE_NODE_write_in_flight_inner);
- }
- } while (!try_cmpxchg(&b->flags, &old, new));
-
- if (new & (1U << BTREE_NODE_write_in_flight))
- __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
- else {
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
- }
-}
-
-static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
-{
- struct btree_trans *trans = bch2_trans_get(c);
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-
- /* we don't need transaction context anymore after we got the lock. */
- bch2_trans_put(trans);
- __btree_node_write_done(c, b, start_time);
- six_unlock_read(&b->c.lock);
-}
-
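-/*
- * Btree node write completion work, run from process context: drop pointers
- * to any replicas where the write failed and, unless this was the node's
- * first write, update the node's key to drop those pointers via a
- * transaction commit.
- */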
-static void btree_node_write_work(struct work_struct *work)
-{
- struct btree_write_bio *wbio =
- container_of(work, struct btree_write_bio, work);
- struct bch_fs *c = wbio->wbio.c;
- struct btree *b = wbio->wbio.bio.bi_private;
- u64 start_time = wbio->start_time;
- int ret = 0;
-
- btree_bounce_free(c,
- wbio->data_bytes,
- wbio->wbio.used_mempool,
- wbio->data);
-
- bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
- bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
-
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = bch_err_throw(c, btree_node_write_all_failed);
- goto err;
- }
-
- if (wbio->wbio.first_btree_write) {
- if (wbio->wbio.failed.nr) {
-
- }
- } else {
- ret = bch2_trans_do(c,
- bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
- BCH_WATERMARK_interior_updates|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw,
- !wbio->wbio.failed.nr));
- if (ret)
- goto err;
- }
-out:
- async_object_list_del(c, btree_write_bio, wbio->list_idx);
- bio_put(&wbio->wbio.bio);
- btree_node_write_done(c, b, start_time);
- return;
-err:
- set_btree_node_noevict(b);
-
- if (!bch2_err_matches(ret, EROFS)) {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
- bch2_btree_pos_to_text(&buf, c, b);
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- goto out;
-}
-
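-/*
- * Per-replica write completion: account the I/O, record any device that
- * failed so its pointer can be dropped later, and release the device ref.
- * The completion of the parent (unsplit) bio clears write_in_flight_inner
- * and queues btree_node_write_work().
- */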
-static void btree_node_write_endio(struct bio *bio)
-{
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
- struct bch_write_bio *orig = parent ?: wbio;
- struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
- struct bch_fs *c = wbio->c;
- struct btree *b = wbio->bio.bi_private;
- struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
- wbio->submit_time, !bio->bi_status);
-
- if (ca && bio->bi_status) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_printf(&buf, "btree write error: %s\n ",
- bch2_blk_status_to_str(bio->bi_status));
- bch2_btree_pos_to_text(&buf, c, b);
- bch_err_dev_ratelimited(ca, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (bio->bi_status) {
- unsigned long flags;
- spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bch2_dev_list_add_dev(&orig->failed, wbio->dev);
- spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
- }
-
- /*
- * XXX: we should be using io_ref[WRITE], but we aren't retrying failed
- * btree writes yet (due to device removal/ro):
- */
- if (wbio->have_ioref)
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_btree_node_write);
-
- if (parent) {
- bio_put(bio);
- bio_endio(&parent->bio);
- return;
- }
-
- clear_btree_node_write_in_flight_inner(b);
- smp_mb__after_atomic();
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
- INIT_WORK(&wb->work, btree_node_write_work);
- queue_work(c->btree_write_complete_wq, &wb->work);
-}
-
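-/*
- * Last-chance validation of the node's key and the bset we're about to
- * write; a failure here marks the filesystem inconsistent instead of letting
- * bad metadata reach disk.
- */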
-static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
- struct bset *i)
-{
- int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
- (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level + 1,
- .btree = b->c.btree_id,
- .flags = BCH_VALIDATE_write,
- });
- if (ret) {
- bch2_fs_inconsistent(c, "invalid btree node key before write");
- return ret;
- }
-
- ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
- validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL);
- if (ret) {
- bch2_inconsistent_error(c);
- dump_stack();
- }
-
- return ret;
-}
-
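-/*
- * Final submission step, run from the write submit workqueue: offset each
- * pointer by the position of this write within the node, then submit a bio
- * to each replica via bch2_submit_wbio_replicas().
- */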
-static void btree_write_submit(struct work_struct *work)
-{
- struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
- BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-
- bkey_copy(&tmp.k, &wbio->key);
-
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
- ptr->offset += wbio->sector_offset;
-
- bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
- &tmp.k, false);
-}
-
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
-{
- struct btree_write_bio *wbio;
- struct bset *i;
- struct btree_node *bn = NULL;
- struct btree_node_entry *bne = NULL;
- struct sort_iter_stack sort_iter;
- struct nonce nonce;
- unsigned bytes_to_write, sectors_to_write, bytes, u64s;
- u64 seq = 0;
- bool used_mempool;
- unsigned long old, new;
- bool validate_before_checksum = false;
- enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
- void *data;
- u64 start_time = local_clock();
- int ret;
-
- if (flags & BTREE_WRITE_ALREADY_STARTED)
- goto do_write;
-
- /*
- * We may only have a read lock on the btree node - the dirty bit is our
- * "lock" against racing with other threads that may be trying to start
- * a write, we do a write iff we clear the dirty bit. Since setting the
- * dirty bit requires a write lock, we can't race with other threads
- * redirtying it:
- */
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- if (!(old & (1 << BTREE_NODE_dirty)))
- return;
-
- if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
- !(old & (1 << BTREE_NODE_need_write)))
- return;
-
- if (old &
- ((1 << BTREE_NODE_never_write)|
- (1 << BTREE_NODE_write_blocked)))
- return;
-
- if (b->written &&
- (old & (1 << BTREE_NODE_will_make_reachable)))
- return;
-
- if (old & (1 << BTREE_NODE_write_in_flight))
- return;
-
- if (flags & BTREE_WRITE_ONLY_IF_NEED)
- type = new & BTREE_WRITE_TYPE_MASK;
- new &= ~BTREE_WRITE_TYPE_MASK;
-
- new &= ~(1 << BTREE_NODE_dirty);
- new &= ~(1 << BTREE_NODE_need_write);
- new |= (1 << BTREE_NODE_write_in_flight);
- new |= (1 << BTREE_NODE_write_in_flight_inner);
- new |= (1 << BTREE_NODE_just_written);
- new ^= (1 << BTREE_NODE_write_idx);
- } while (!try_cmpxchg_acquire(&b->flags, &old, new));
-
- if (new & (1U << BTREE_NODE_need_write))
- return;
-do_write:
- BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
-
- atomic_long_dec(&c->btree_cache.nr_dirty);
-
- BUG_ON(btree_node_fake(b));
- BUG_ON((b->will_make_reachable != 0) != !b->written);
-
- BUG_ON(b->written >= btree_sectors(c));
- BUG_ON(b->written & (block_sectors(c) - 1));
- BUG_ON(bset_written(b, btree_bset_last(b)));
- BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
- BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
-
- bch2_sort_whiteouts(c, b);
-
- sort_iter_stack_init(&sort_iter, b);
-
- bytes = !b->written
- ? sizeof(struct btree_node)
- : sizeof(struct btree_node_entry);
-
- bytes += b->whiteout_u64s * sizeof(u64);
-
- for_each_bset(b, t) {
- i = bset(b, t);
-
- if (bset_written(b, i))
- continue;
-
- bytes += le16_to_cpu(i->u64s) * sizeof(u64);
- sort_iter_add(&sort_iter.iter,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- seq = max(seq, le64_to_cpu(i->journal_seq));
- }
-
- BUG_ON(b->written && !seq);
-
- /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
- bytes += 8;
-
- /* buffer must be a multiple of the block size */
- bytes = round_up(bytes, block_bytes(c));
-
- data = btree_bounce_alloc(c, bytes, &used_mempool);
-
- if (!b->written) {
- bn = data;
- *bn = *b->data;
- i = &bn->keys;
- } else {
- bne = data;
- bne->keys = b->data->keys;
- i = &bne->keys;
- }
-
- i->journal_seq = cpu_to_le64(seq);
- i->u64s = 0;
-
- sort_iter_add(&sort_iter.iter,
- unwritten_whiteouts_start(b),
- unwritten_whiteouts_end(b));
- SET_BSET_SEPARATE_WHITEOUTS(i, false);
-
- u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
- le16_add_cpu(&i->u64s, u64s);
-
- b->whiteout_u64s = 0;
-
- BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
-
- set_needs_whiteout(i, false);
-
- /* do we have data to write? */
- if (b->written && !i->u64s)
- goto nowrite;
-
- bytes_to_write = vstruct_end(i) - data;
- sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
- if (!b->written &&
- b->key.k.type == KEY_TYPE_btree_ptr_v2)
- BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);
-
- memset(data + bytes_to_write, 0,
- (sectors_to_write << 9) - bytes_to_write);
-
- BUG_ON(b->written + sectors_to_write > btree_sectors(c));
- BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
- BUG_ON(i->seq != b->data->keys.seq);
-
- i->version = cpu_to_le16(c->sb.version);
- SET_BSET_OFFSET(i, b->written);
- SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
-
- if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
- validate_before_checksum = true;
-
- /* validate_bset will be modifying: */
- if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
- validate_before_checksum = true;
-
- /* if we're going to be encrypting, check metadata validity first: */
- if (validate_before_checksum &&
- validate_bset_for_write(c, b, i))
- goto err;
-
- ret = bset_encrypt(c, i, b->written << 9);
- if (bch2_fs_fatal_err_on(ret, c,
- "encrypting btree node: %s", bch2_err_str(ret)))
- goto err;
-
- nonce = btree_nonce(i, b->written << 9);
-
- if (bn)
- bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
- else
- bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
- /* if we're not encrypting, check metadata after checksumming: */
- if (!validate_before_checksum &&
- validate_bset_for_write(c, b, i))
- goto err;
-
- /*
- * We handle btree write errors by immediately halting the journal -
- * after we've done that, we can't issue any subsequent btree writes
- * because they might have pointers to new nodes that failed to write.
- *
- * Furthermore, there's no point in doing any more btree writes because
- * with the journal stopped, we're never going to update the journal to
- * reflect that those writes were done and the data flushed from the
- * journal:
- *
- * Also on journal error, the pending write may have updates that were
- * never journalled (interior nodes, see btree_update_nodes_written()) -
- * it's critical that we don't do the write in that case otherwise we
- * will have updates visible that weren't in the journal:
- *
- * Make sure to update b->written so bch2_btree_init_next() doesn't
- * break:
- */
- if (bch2_journal_error(&c->journal) ||
- c->opts.nochanges)
- goto err;
-
- trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
-
- wbio = container_of(bio_alloc_bioset(NULL,
- buf_pages(data, sectors_to_write << 9),
- REQ_OP_WRITE|REQ_META,
- GFP_NOFS,
- &c->btree_bio),
- struct btree_write_bio, wbio.bio);
- wbio_init(&wbio->wbio.bio);
- wbio->data = data;
- wbio->data_bytes = bytes;
- wbio->sector_offset = b->written;
- wbio->start_time = start_time;
- wbio->wbio.c = c;
- wbio->wbio.used_mempool = used_mempool;
- wbio->wbio.first_btree_write = !b->written;
- wbio->wbio.bio.bi_end_io = btree_node_write_endio;
- wbio->wbio.bio.bi_private = b;
-
- bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
-
- bkey_copy(&wbio->key, &b->key);
-
- b->written += sectors_to_write;
-
- if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
- cpu_to_le16(b->written);
-
- atomic64_inc(&c->btree_write_stats[type].nr);
- atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
-
- async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx);
-
- INIT_WORK(&wbio->work, btree_write_submit);
- queue_work(c->btree_write_submit_wq, &wbio->work);
- return;
-err:
- set_btree_node_noevict(b);
- b->written += sectors_to_write;
-nowrite:
- btree_bounce_free(c, bytes, used_mempool, data);
- __btree_node_write_done(c, b, 0);
-}
-
-/*
- * Work that must be done with write lock held:
- */
-bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
-{
- bool invalidated_iter = false;
- struct btree_node_entry *bne;
-
- if (!btree_node_just_written(b))
- return false;
-
- BUG_ON(b->whiteout_u64s);
-
- clear_btree_node_just_written(b);
-
- /*
- * Note: immediately after write, bset_written() doesn't work - the
- * amount of data we had to write after compaction might have been
- * smaller than the offset of the last bset.
- *
- * However, we know that all bsets have been written here, as long as
- * we're still holding the write lock:
- */
-
- /*
- * XXX: decide if we really want to unconditionally sort down to a
- * single bset:
- */
- if (b->nsets > 1) {
- btree_node_sort(c, b, 0, b->nsets);
- invalidated_iter = true;
- } else {
- invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
- }
-
- for_each_bset(b, t)
- set_needs_whiteout(bset(b, t), true);
-
- bch2_btree_verify(c, b);
-
- /*
- * If later we don't unconditionally sort down to a single bset, we have
- * to ensure this is still true:
- */
- BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
-
- bne = want_new_bset(c, b);
- if (bne)
- bch2_bset_init_next(b, bne);
-
- bch2_btree_build_aux_trees(b);
-
- return invalidated_iter;
-}
-
-/*
- * Use this one if the node is intent locked:
- */
-void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
- enum six_lock_type lock_type_held,
- unsigned flags)
-{
- if (lock_type_held == SIX_LOCK_intent ||
- (lock_type_held == SIX_LOCK_read &&
- six_lock_tryupgrade(&b->c.lock))) {
- __bch2_btree_node_write(c, b, flags);
-
- /* don't cycle lock unnecessarily: */
- if (btree_node_just_written(b) &&
- six_trylock_write(&b->c.lock)) {
- bch2_btree_post_write_cleanup(c, b);
- six_unlock_write(&b->c.lock);
- }
-
- if (lock_type_held == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
- } else {
- __bch2_btree_node_write(c, b, flags);
- if (lock_type_held == SIX_LOCK_write &&
- btree_node_just_written(b))
- bch2_btree_post_write_cleanup(c, b);
- }
-}
-
-void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b,
- enum six_lock_type lock_type_held,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
-
- if (lock_type_held == SIX_LOCK_intent ||
- (lock_type_held == SIX_LOCK_read &&
- six_lock_tryupgrade(&b->c.lock))) {
- __bch2_btree_node_write(c, b, flags);
-
- /* don't cycle lock unnecessarily: */
- if (btree_node_just_written(b) &&
- six_trylock_write(&b->c.lock)) {
- bch2_btree_post_write_cleanup(c, b);
- __bch2_btree_node_unlock_write(trans, b);
- }
-
- if (lock_type_held == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
- } else {
- __bch2_btree_node_write(c, b, flags);
- if (lock_type_held == SIX_LOCK_write &&
- btree_node_just_written(b))
- bch2_btree_post_write_cleanup(c, b);
- }
-}
-
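-/*
- * Wait for all in-flight btree node reads or writes (selected by @flag) to
- * complete: scan the btree node hash table under rcu, and restart the scan
- * from the beginning each time we drop rcu to wait on a node.
- */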
-static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
-{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
- bool ret = false;
-restart:
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos)
- if (test_bit(flag, &b->flags)) {
- rcu_read_unlock();
- wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
- ret = true;
- goto restart;
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-bool bch2_btree_flush_all_reads(struct bch_fs *c)
-{
- return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
-}
-
-bool bch2_btree_flush_all_writes(struct bch_fs *c)
-{
- return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
-}
-
-static const char * const bch2_btree_write_types[] = {
-#define x(t, n) [n] = #t,
- BCH_BTREE_WRITE_TYPES()
- NULL
-};
-
-void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 20);
- printbuf_tabstop_push(out, 10);
-
- prt_printf(out, "\tnr\tsize\n");
-
- for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
- u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
- u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
-
- prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
- prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
- prt_newline(out);
- }
-}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
deleted file mode 100644
index 30a5180532c8..000000000000
--- a/fs/bcachefs/btree_io.h
+++ /dev/null
@@ -1,239 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_IO_H
-#define _BCACHEFS_BTREE_IO_H
-
-#include "bkey_methods.h"
-#include "bset.h"
-#include "btree_locking.h"
-#include "checksum.h"
-#include "extents.h"
-#include "io_write_types.h"
-
-struct bch_fs;
-struct btree_write;
-struct btree;
-struct btree_iter;
-struct btree_node_read_all;
-
-static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
-{
- if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
- atomic_long_inc(&c->btree_cache.nr_dirty);
-}
-
-static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
-{
- if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
- atomic_long_dec(&c->btree_cache.nr_dirty);
-}
-
-static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
-{
- return k.k->type == KEY_TYPE_btree_ptr_v2
- ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written)
- : 0;
-}
-
-struct btree_read_bio {
- struct bch_fs *c;
- struct btree *b;
- struct btree_node_read_all *ra;
- u64 start_time;
- unsigned have_ioref:1;
- unsigned idx:7;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- unsigned list_idx;
-#endif
- struct extent_ptr_decoded pick;
- struct work_struct work;
- struct bio bio;
-};
-
-struct btree_write_bio {
- struct work_struct work;
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
- void *data;
- unsigned data_bytes;
- unsigned sector_offset;
- u64 start_time;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- unsigned list_idx;
-#endif
- struct bch_write_bio wbio;
-};
-
-void bch2_btree_node_io_unlock(struct btree *);
-void bch2_btree_node_io_lock(struct btree *);
-void __bch2_btree_node_wait_on_read(struct btree *);
-void __bch2_btree_node_wait_on_write(struct btree *);
-void bch2_btree_node_wait_on_read(struct btree *);
-void bch2_btree_node_wait_on_write(struct btree *);
-
-enum compact_mode {
- COMPACT_LAZY,
- COMPACT_ALL,
-};
-
-bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
- enum compact_mode);
-
-static inline bool should_compact_bset_lazy(struct btree *b,
- struct bset_tree *t)
-{
- unsigned total_u64s = bset_u64s(t);
- unsigned dead_u64s = bset_dead_u64s(b, t);
-
- return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
-}
-
-static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
-{
- for_each_bset(b, t)
- if (should_compact_bset_lazy(b, t))
- return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
-
- return false;
-}
-
-static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
-{
- return (struct nonce) {{
- [0] = cpu_to_le32(offset),
- [1] = ((__le32 *) &i->seq)[0],
- [2] = ((__le32 *) &i->seq)[1],
- [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
- }};
-}
-
-static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
-{
- struct nonce nonce = btree_nonce(i, offset);
- int ret;
-
- if (!offset) {
- struct btree_node *bn = container_of(i, struct btree_node, keys);
- unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
- ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
- &bn->flags, bytes);
- if (ret)
- return ret;
-
- nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
- }
-
- return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
- vstruct_end(i) - (void *) i->_data);
-}
-
-void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
-
-void bch2_btree_node_drop_keys_outside_node(struct btree *);
-
-void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct btree_trans *, struct btree *);
-
-int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
- struct btree *,
- struct bch_io_failures *,
- struct printbuf *);
-void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
-int bch2_btree_root_read(struct bch_fs *, enum btree_id,
- const struct bkey_i *, unsigned);
-
-void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *);
-
-int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, unsigned);
-
-bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
-
-enum btree_write_flags {
- __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
- __BTREE_WRITE_ALREADY_STARTED,
-};
-#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED)
-#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED)
-
-void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
-void bch2_btree_node_write(struct bch_fs *, struct btree *,
- enum six_lock_type, unsigned);
-void bch2_btree_node_write_trans(struct btree_trans *, struct btree *,
- enum six_lock_type, unsigned);
-
-static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b,
- enum six_lock_type lock_held)
-{
- bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
-}
-
-bool bch2_btree_flush_all_reads(struct bch_fs *);
-bool bch2_btree_flush_all_writes(struct bch_fs *);
-
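-/*
- * Format compatibility helpers: metadata versions before
- * inode_btree_change/snapshot laid out inode btree keys and the snapshot
- * field differently; these convert key formats, positions and btree node
- * headers when reading or writing nodes in the old format.
- */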
-static inline void compat_bformat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write, struct bkey_format *f)
-{
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_inodes) {
- swap(f->bits_per_field[BKEY_FIELD_INODE],
- f->bits_per_field[BKEY_FIELD_OFFSET]);
- swap(f->field_offset[BKEY_FIELD_INODE],
- f->field_offset[BKEY_FIELD_OFFSET]);
- }
-
- if (version < bcachefs_metadata_version_snapshot &&
- (level || btree_type_has_snapshots(btree_id))) {
- u64 max_packed =
- ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
-
- f->field_offset[BKEY_FIELD_SNAPSHOT] = write
- ? 0
- : cpu_to_le64(U32_MAX - max_packed);
- }
-}
-
-static inline void compat_bpos(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write, struct bpos *p)
-{
- if (big_endian != CPU_BIG_ENDIAN)
- bch2_bpos_swab(p);
-
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_inodes)
- swap(p->inode, p->offset);
-}
-
-static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write,
- struct btree_node *bn)
-{
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id_is_extents(btree_id) &&
- !bpos_eq(bn->min_key, POS_MIN) &&
- write)
- bn->min_key = bpos_nosnap_predecessor(bn->min_key);
-
- if (version < bcachefs_metadata_version_snapshot &&
- write)
- bn->max_key.snapshot = 0;
-
- compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
- compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
-
- if (version < bcachefs_metadata_version_snapshot &&
- !write)
- bn->max_key.snapshot = U32_MAX;
-
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id_is_extents(btree_id) &&
- !bpos_eq(bn->min_key, POS_MIN) &&
- !write)
- bn->min_key = bpos_nosnap_successor(bn->min_key);
-}
-
-void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
deleted file mode 100644
index f8829b667ad3..000000000000
--- a/fs/bcachefs/btree_iter.c
+++ /dev/null
@@ -1,3804 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "super.h"
-#include "trace.h"
-
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
-static inline void btree_path_list_add(struct btree_trans *,
- btree_path_idx_t, btree_path_idx_t);
-
-static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
-{
-#ifdef TRACK_PATH_ALLOCATED
- return iter->ip_allocated;
-#else
- return 0;
-#endif
-}
-
-static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
-static void bch2_trans_srcu_lock(struct btree_trans *);
-
-static inline int __btree_path_cmp(const struct btree_path *l,
- enum btree_id r_btree_id,
- bool r_cached,
- struct bpos r_pos,
- unsigned r_level)
-{
- /*
- * Must match lock ordering as defined by __bch2_btree_node_lock:
- */
- return cmp_int(l->btree_id, r_btree_id) ?:
- cmp_int((int) l->cached, (int) r_cached) ?:
- bpos_cmp(l->pos, r_pos) ?:
- -cmp_int(l->level, r_level);
-}
-
-static inline int btree_path_cmp(const struct btree_path *l,
- const struct btree_path *r)
-{
- return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
-}
-
-static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
-{
- /* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_all_snapshots) {
- p = bpos_successor(p);
- } else {
- p = bpos_nosnap_successor(p);
- p.snapshot = iter->snapshot;
- }
-
- return p;
-}
-
-static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
-{
- /* Are we iterating over keys in all snapshots? */
- if (iter->flags & BTREE_ITER_all_snapshots) {
- p = bpos_predecessor(p);
- } else {
- p = bpos_nosnap_predecessor(p);
- p.snapshot = iter->snapshot;
- }
-
- return p;
-}
-
-static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
-{
- struct bpos pos = iter->pos;
-
- if ((iter->flags & BTREE_ITER_is_extents) &&
- !bkey_eq(pos, POS_MAX))
- pos = bkey_successor(iter, pos);
- return pos;
-}
-
-static inline bool btree_path_pos_before_node(struct btree_path *path,
- struct btree *b)
-{
- return bpos_lt(path->pos, b->data->min_key);
-}
-
-static inline bool btree_path_pos_after_node(struct btree_path *path,
- struct btree *b)
-{
- return bpos_gt(path->pos, b->key.k.p);
-}
-
-static inline bool btree_path_pos_in_node(struct btree_path *path,
- struct btree *b)
-{
- return path->btree_id == b->c.btree_id &&
- !btree_path_pos_before_node(path, b) &&
- !btree_path_pos_after_node(path, b);
-}
-
-/* Debug: */
-
-static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bkey_cached *ck;
- bool locked = btree_node_locked(path, 0);
-
- if (!bch2_btree_node_relock(trans, path, 0))
- return;
-
- ck = (void *) path->l[0].b;
- BUG_ON(ck->key.btree_id != path->btree_id ||
- !bkey_eq(ck->key.pos, path->pos));
-
- if (!locked)
- btree_node_unlock(trans, path, 0);
-}
-
-static void __bch2_btree_path_verify_level(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree_path_level *l;
- struct btree_node_iter tmp;
- bool locked;
- struct bkey_packed *p, *k;
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
- struct printbuf buf3 = PRINTBUF;
- const char *msg;
-
- l = &path->l[level];
- tmp = l->iter;
- locked = btree_node_locked(path, level);
-
- if (path->cached) {
- if (!level)
- __bch2_btree_path_verify_cached(trans, path);
- return;
- }
-
- if (!btree_path_node(path, level))
- return;
-
- if (!bch2_btree_node_relock_notrace(trans, path, level))
- return;
-
- BUG_ON(!btree_path_pos_in_node(path, l->b));
-
- bch2_btree_node_iter_verify(&l->iter, l->b);
-
- /*
- * For interior nodes, the iterator will have skipped past deleted keys:
- */
- p = level
- ? bch2_btree_node_iter_prev(&tmp, l->b)
- : bch2_btree_node_iter_prev_all(&tmp, l->b);
- k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
-
- if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
- msg = "before";
- goto err;
- }
-
- if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
- msg = "after";
- goto err;
- }
-
- if (!locked)
- btree_node_unlock(trans, path, level);
- return;
-err:
- bch2_bpos_to_text(&buf1, path->pos);
-
- if (p) {
- struct bkey uk = bkey_unpack_key(l->b, p);
-
- bch2_bkey_to_text(&buf2, &uk);
- } else {
- prt_printf(&buf2, "(none)");
- }
-
- if (k) {
- struct bkey uk = bkey_unpack_key(l->b, k);
-
- bch2_bkey_to_text(&buf3, &uk);
- } else {
- prt_printf(&buf3, "(none)");
- }
-
- panic("path should be %s key at level %u:\n"
- "path pos %s\n"
- "prev key %s\n"
- "cur key %s\n",
- msg, level, buf1.buf, buf2.buf, buf3.buf);
-}
-
-static void __bch2_btree_path_verify(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
-
- for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
- if (!path->l[i].b) {
- BUG_ON(!path->cached &&
- bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
- break;
- }
-
- __bch2_btree_path_verify_level(trans, path, i);
- }
-
- bch2_btree_path_verify_locks(trans, path);
-}
-
-void __bch2_trans_verify_paths(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned iter;
-
- trans_for_each_path(trans, path, iter)
- __bch2_btree_path_verify(trans, path);
-}
-
-static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
-{
- BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
-
- BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
- (iter->flags & BTREE_ITER_all_snapshots));
-
- BUG_ON(!(iter->flags & BTREE_ITER_snapshot_field) &&
- (iter->flags & BTREE_ITER_all_snapshots) &&
- !btree_type_has_snapshot_field(iter->btree_id));
-
- if (iter->update_path)
- __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
- __bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
-}
-
-static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
- BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
- !iter->pos.snapshot);
-
- BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
- iter->pos.snapshot != iter->snapshot);
-
- BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) :
- !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) :
- (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
- bkey_gt(iter->pos, iter->k.p)));
-}
-
-static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
- struct btree_iter *iter, struct bkey_s_c k)
-{
- struct btree_iter copy;
- struct bkey_s_c prev;
- int ret = 0;
-
- if (!(iter->flags & BTREE_ITER_filter_snapshots))
- return 0;
-
- if (bkey_err(k) || !k.k)
- return 0;
-
- BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot));
-
- bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
- BTREE_ITER_nopreserve|
- BTREE_ITER_all_snapshots);
- prev = bch2_btree_iter_prev(trans, &copy);
- if (!prev.k)
- goto out;
-
- ret = bkey_err(prev);
- if (ret)
- goto out;
-
- if (bkey_eq(prev.k->p, k.k->p) &&
- bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
- prev.k->p.snapshot) > 0) {
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
- bch2_bkey_to_text(&buf1, k.k);
- bch2_bkey_to_text(&buf2, prev.k);
-
- panic("iter snap %u\n"
- "k %s\n"
- "prev %s\n",
- iter->snapshot,
- buf1.buf, buf2.buf);
- }
-out:
- bch2_trans_iter_exit(trans, &copy);
- return ret;
-}
-
-void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
- struct bpos pos)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- struct btree_path *path;
- struct trans_for_each_path_inorder_iter iter;
- struct printbuf buf = PRINTBUF;
-
- btree_trans_sort_paths(trans);
-
- trans_for_each_path_inorder(trans, path, iter) {
- if (path->btree_id != id ||
- !btree_node_locked(path, 0) ||
- !path->should_be_locked)
- continue;
-
- if (!path->cached) {
- if (bkey_ge(pos, path->l[0].b->data->min_key) &&
- bkey_le(pos, path->l[0].b->key.k.p))
- return;
- } else {
- if (bkey_eq(pos, path->pos))
- return;
- }
- }
-
- bch2_dump_trans_paths_updates(trans);
- bch2_bpos_to_text(&buf, pos);
-
- panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
-}
-
-static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
- struct btree_path *path, unsigned l)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_btree_path_verify_level(trans, path, l);
-}
-
-static inline void bch2_btree_path_verify(struct btree_trans *trans,
- struct btree_path *path)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_btree_path_verify(trans, path);
-}
-
-static inline void bch2_btree_iter_verify(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_btree_iter_verify(trans, iter);
-}
-
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_btree_iter_verify_entry_exit(iter);
-}
-
-static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- return static_branch_unlikely(&bch2_debug_check_iterators)
- ? __bch2_btree_iter_verify_ret(trans, iter, k)
- : 0;
-}
-
-/* Btree path: fixups after btree updates */
-
-static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
- struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct btree_node_iter_set *set;
-
- btree_node_iter_for_each(iter, set)
- if (set->end == t->end_offset) {
- set->k = __btree_node_key_to_offset(b, k);
- bch2_btree_node_iter_sort(iter, b);
- return;
- }
-
- bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
-}
-
-static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
- struct btree *b,
- struct bkey_packed *where)
-{
- struct btree_path_level *l = &path->l[b->c.level];
-
- if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
- return;
-
- if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
- bch2_btree_node_iter_advance(&l->iter, l->b);
-}
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
- struct btree *b,
- struct bkey_packed *where)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path_with_node(trans, b, path, i) {
- __bch2_btree_path_fix_key_modified(path, b, where);
- bch2_btree_path_verify_level(trans, path, b->c.level);
- }
-}
-
-static void __bch2_btree_node_iter_fix(struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bset_tree *t,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- const struct bkey_packed *end = btree_bkey_last(b, t);
- struct btree_node_iter_set *set;
- unsigned offset = __btree_node_key_to_offset(b, where);
- int shift = new_u64s - clobber_u64s;
- unsigned old_end = t->end_offset - shift;
- unsigned orig_iter_pos = node_iter->data[0].k;
- bool iter_current_key_modified =
- orig_iter_pos >= offset &&
- orig_iter_pos <= offset + clobber_u64s;
-
- btree_node_iter_for_each(node_iter, set)
- if (set->end == old_end)
- goto found;
-
-	/* didn't find the bset in the iterator - might have to re-add it: */
- if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
- bch2_btree_node_iter_push(node_iter, b, where, end);
- goto fixup_done;
- } else {
- /* Iterator is after key that changed */
- return;
- }
-found:
- set->end = t->end_offset;
-
- /* Iterator hasn't gotten to the key that changed yet: */
- if (set->k < offset)
- return;
-
- if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
- set->k = offset;
- } else if (set->k < offset + clobber_u64s) {
- set->k = offset + new_u64s;
- if (set->k == set->end)
- bch2_btree_node_iter_set_drop(node_iter, set);
- } else {
- /* Iterator is after key that changed */
- set->k = (int) set->k + shift;
- return;
- }
-
- bch2_btree_node_iter_sort(node_iter, b);
-fixup_done:
- if (node_iter->data[0].k != orig_iter_pos)
- iter_current_key_modified = true;
-
- /*
- * When a new key is added, and the node iterator now points to that
- * key, the iterator might have skipped past deleted keys that should
- * come after the key the iterator now points to. We have to rewind to
- * before those deleted keys - otherwise
- * bch2_btree_node_iter_prev_all() breaks:
- */
- if (!bch2_btree_node_iter_end(node_iter) &&
- iter_current_key_modified &&
- b->c.level) {
- struct bkey_packed *k, *k2, *p;
-
- k = bch2_btree_node_iter_peek_all(node_iter, b);
-
- for_each_bset(b, t) {
- bool set_pos = false;
-
- if (node_iter->data[0].end == t->end_offset)
- continue;
-
- k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
-
- while ((p = bch2_bkey_prev_all(b, t, k2)) &&
- bkey_iter_cmp(b, k, p) < 0) {
- k2 = p;
- set_pos = true;
- }
-
- if (set_pos)
- btree_node_iter_set_set_pos(node_iter,
- b, t, k2);
- }
- }
-}
-
-void bch2_btree_node_iter_fix(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
- struct btree_path *linked;
- unsigned i;
-
- if (node_iter != &path->l[b->c.level].iter) {
- __bch2_btree_node_iter_fix(path, b, node_iter, t,
- where, clobber_u64s, new_u64s);
-
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- bch2_btree_node_iter_verify(node_iter, b);
- }
-
- trans_for_each_path_with_node(trans, b, linked, i) {
- __bch2_btree_node_iter_fix(linked, b,
- &linked->l[b->c.level].iter, t,
- where, clobber_u64s, new_u64s);
- bch2_btree_path_verify_level(trans, linked, b->c.level);
- }
-}
-
-/* Btree path level: pointer to a particular btree node and node iter */
-
-static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
- struct btree_path_level *l,
- struct bkey *u,
- struct bkey_packed *k)
-{
- if (unlikely(!k)) {
- /*
- * signal to bch2_btree_iter_peek_slot() that we're currently at
- * a hole
- */
- u->type = KEY_TYPE_deleted;
- return bkey_s_c_null;
- }
-
- return bkey_disassemble(l->b, k, u);
-}
-
-static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
- struct btree_path_level *l,
- struct bkey *u)
-{
- return __btree_iter_unpack(c, l, u,
- bch2_btree_node_iter_peek_all(&l->iter, l->b));
-}
-
-static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path_level *l,
- struct bkey *u)
-{
- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
- bch2_btree_node_iter_prev(&l->iter, l->b));
-
- path->pos = k.k ? k.k->p : l->b->data->min_key;
- trans->paths_sorted = false;
- bch2_btree_path_verify_level(trans, path, l - path->l);
- return k;
-}
-
-static inline bool btree_path_advance_to_pos(struct btree_path *path,
- struct btree_path_level *l,
- int max_advance)
-{
- struct bkey_packed *k;
- int nr_advanced = 0;
-
- while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
- bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
- if (max_advance > 0 && nr_advanced >= max_advance)
- return false;
-
- bch2_btree_node_iter_advance(&l->iter, l->b);
- nr_advanced++;
- }
-
- return true;
-}
-
-static inline void __btree_path_level_init(struct btree_path *path,
- unsigned level)
-{
- struct btree_path_level *l = &path->l[level];
-
- bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
- /*
- * Iterators to interior nodes should always be pointed at the first non
- * whiteout:
- */
- if (level)
- bch2_btree_node_iter_peek(&l->iter, l->b);
-}
-
-void bch2_btree_path_level_init(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- BUG_ON(path->cached);
-
- EBUG_ON(!btree_path_pos_in_node(path, b));
-
- path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
- path->l[b->c.level].b = b;
- __btree_path_level_init(path, b->c.level);
-}
-
-/* Btree path: fixups after btree node updates: */
-
-static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- trans_for_each_update(trans, i)
- if (!i->cached &&
- i->level == b->c.level &&
- i->btree_id == b->c.btree_id &&
- bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
- bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
- i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
- i->k->k.p);
-
- if (j_k) {
- i->old_k = j_k->k;
- i->old_v = &j_k->v;
- }
- }
- }
-}
-
-/*
- * A btree node is being replaced - update the iterator to point to the new
- * node:
- */
-void bch2_trans_node_add(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- struct btree_path *prev;
-
- BUG_ON(!btree_path_pos_in_node(path, b));
-
- while ((prev = prev_btree_path(trans, path)) &&
- btree_path_pos_in_node(prev, b))
- path = prev;
-
- for (;
- path && btree_path_pos_in_node(path, b);
- path = next_btree_path(trans, path))
- if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
- enum btree_node_locked_type t =
- btree_lock_want(path, b->c.level);
-
- if (t != BTREE_NODE_UNLOCKED) {
- btree_node_unlock(trans, path, b->c.level);
- six_lock_increment(&b->c.lock, (enum six_lock_type) t);
- mark_btree_node_locked(trans, path, b->c.level, t);
- }
-
- bch2_btree_path_level_init(trans, path, b);
- }
-
- bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-void bch2_trans_node_drop(struct btree_trans *trans,
- struct btree *b)
-{
- struct btree_path *path;
- unsigned i, level = b->c.level;
-
- trans_for_each_path(trans, path, i)
- if (path->l[level].b == b) {
- btree_node_unlock(trans, path, level);
- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
- }
-}
-
-/*
- * A btree node has been modified in such a way as to invalidate iterators - fix
- * them:
- */
-void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path_with_node(trans, b, path, i)
- __btree_path_level_init(path, b->c.level);
-
- bch2_trans_revalidate_updates_in_node(trans, b);
-}
-
-/* Btree path: traverse, set_pos: */
-
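-/*
- * Lock the btree root for @path: the root may be freed and replaced while we
- * wait on the lock, so loop until the node we locked is still the current
- * root.
- */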
-static inline int btree_path_lock_root(struct btree_trans *trans,
- struct btree_path *path,
- unsigned depth_want,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
- enum six_lock_type lock_type;
- unsigned i;
- int ret;
-
- EBUG_ON(path->nodes_locked);
-
- while (1) {
- struct btree *b = READ_ONCE(r->b);
- if (unlikely(!b)) {
- BUG_ON(!r->error);
- return r->error;
- }
-
- path->level = READ_ONCE(b->c.level);
-
- if (unlikely(path->level < depth_want)) {
- /*
- * the root is at a lower depth than the depth we want:
- * got to the end of the btree, or we're walking nodes
- * greater than some depth and there are no nodes >=
- * that depth
- */
- path->level = depth_want;
- for (i = path->level; i < BTREE_MAX_DEPTH; i++)
- path->l[i].b = NULL;
- return 1;
- }
-
- lock_type = __btree_lock_want(path, path->level);
- ret = btree_node_lock(trans, path, &b->c,
- path->level, lock_type, trace_ip);
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
- BUG();
- }
-
- if (likely(b == READ_ONCE(r->b) &&
- b->c.level == path->level &&
- !race_fault())) {
- for (i = 0; i < path->level; i++)
- path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
- path->l[path->level].b = b;
- for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
- path->l[i].b = NULL;
-
- mark_btree_node_locked(trans, path, path->level,
- (enum btree_node_locked_type) lock_type);
- bch2_btree_path_level_init(trans, path, b);
- return 0;
- }
-
- six_unlock_type(&b->c.lock, lock_type);
- }
-}
-
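-/*
- * Readahead for btree traversal: prefetch the next few child nodes following
- * the one we're descending into, more aggressively during recovery (before
- * BCH_FS_started is set).
- */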
-noinline
-static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l = path_l(path);
- struct btree_node_iter node_iter = l->iter;
- struct bkey_packed *k;
- struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_started, &c->flags)
- ? (path->level > 1 ? 0 : 2)
- : (path->level > 1 ? 1 : 16);
- bool was_locked = btree_node_locked(path, path->level);
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
-
- while (nr-- && !ret) {
- if (!bch2_btree_node_relock(trans, path, path->level))
- break;
-
- bch2_btree_node_iter_advance(&node_iter, l->b);
- k = bch2_btree_node_iter_peek(&node_iter, l->b);
- if (!k)
- break;
-
- bch2_bkey_buf_unpack(&tmp, c, l->b, k);
- ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
- path->level - 1);
- }
-
- if (!was_locked)
- btree_node_unlock(trans, path, path->level);
-
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
-static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
- struct btree_and_journal_iter *jiter)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_started, &c->flags)
- ? (path->level > 1 ? 0 : 2)
- : (path->level > 1 ? 1 : 16);
- bool was_locked = btree_node_locked(path, path->level);
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
-
- jiter->fail_if_too_many_whiteouts = true;
-
- while (nr-- && !ret) {
- if (!bch2_btree_node_relock(trans, path, path->level))
- break;
-
- bch2_btree_and_journal_iter_advance(jiter);
- k = bch2_btree_and_journal_iter_peek(jiter);
- if (!k.k)
- break;
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
- path->level - 1);
- }
-
- if (!was_locked)
- btree_node_unlock(trans, path, path->level);
-
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
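-/*
- * Cache the child node's in-memory address in the parent's btree_ptr_v2
- * mem_ptr field so that future traversals can skip the hash table lookup.
- */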
-static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
- struct btree_path *path,
- unsigned plevel, struct btree *b)
-{
- struct btree_path_level *l = &path->l[plevel];
- bool locked = btree_node_locked(path, plevel);
- struct bkey_packed *k;
- struct bch_btree_ptr_v2 *bp;
-
- if (!bch2_btree_node_relock(trans, path, plevel))
- return;
-
- k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
-
- bp = (void *) bkeyp_val(&l->b->format, k);
- bp->mem_ptr = (unsigned long)b;
-
- if (!locked)
- btree_node_unlock(trans, path, plevel);
-}
-
-static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
- struct btree_path *path,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l = path_l(path);
- struct btree_and_journal_iter jiter;
- struct bkey_s_c k;
- int ret = 0;
-
- __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
-
- k = bch2_btree_and_journal_iter_peek(&jiter);
- if (!k.k) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "node not found at pos ");
- bch2_bpos_to_text(&buf, path->pos);
- prt_str(&buf, " at btree ");
- bch2_btree_pos_to_text(&buf, c, l->b);
-
- ret = bch2_fs_topology_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- bkey_reassemble(&trans->btree_path_down, k);
-
- if ((flags & BTREE_ITER_prefetch) &&
- c->opts.btree_node_prefetch)
- ret = btree_path_prefetch_j(trans, path, &jiter);
-
-err:
- bch2_btree_and_journal_iter_exit(&jiter);
- return ret;
-}
-
-static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "node not found at pos ");
- bch2_bpos_to_text(&buf, path->pos);
- prt_str(&buf, " within parent node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
-
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return bch_err_throw(c, btree_need_topology_repair);
-}
-
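-/*
- * Descend one level: peek the key for the child node from the parent (or
- * from the journal during replay), get the child from the btree node cache,
- * lock it, and initialize the path's node iterator at the new level.
- */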
-static __always_inline int btree_path_down(struct btree_trans *trans,
- struct btree_path *path,
- unsigned flags,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l = path_l(path);
- struct btree *b;
- unsigned level = path->level - 1;
- enum six_lock_type lock_type = __btree_lock_want(path, level);
- int ret;
-
- EBUG_ON(!btree_node_locked(path, path->level));
-
- if (unlikely(trans->journal_replay_not_finished)) {
- ret = btree_node_iter_and_journal_peek(trans, path, flags);
- if (ret)
- return ret;
- } else {
- struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
- if (unlikely(!k))
- return btree_node_missing_err(trans, path);
-
- bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
-
- if (unlikely((flags & BTREE_ITER_prefetch)) &&
- c->opts.btree_node_prefetch) {
- ret = btree_path_prefetch(trans, path);
- if (ret)
- return ret;
- }
- }
-
- b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
- level, lock_type, trace_ip);
- ret = PTR_ERR_OR_ZERO(b);
- if (unlikely(ret))
- return ret;
-
- if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
- likely(!trans->journal_replay_not_finished &&
- trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
- btree_node_mem_ptr_set(trans, path, level + 1, b);
-
- if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(trans, path, level + 1);
-
- mark_btree_node_locked(trans, path, level,
- (enum btree_node_locked_type) lock_type);
- path->level = level;
- bch2_btree_path_level_init(trans, path, b);
-
- bch2_btree_path_verify_locks(trans, path);
- return 0;
-}
-
-static int bch2_btree_path_traverse_all(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_path *path;
- unsigned long trace_ip = _RET_IP_;
- unsigned i;
- int ret = 0;
-
- if (trans->in_traverse_all)
- return bch_err_throw(c, transaction_restart_in_traverse_all);
-
- trans->in_traverse_all = true;
-retry_all:
- trans->restarted = 0;
- trans->last_restarted_ip = 0;
-
- trans_for_each_path(trans, path, i)
- path->should_be_locked = false;
-
- btree_trans_sort_paths(trans);
-
- bch2_trans_unlock(trans);
- cond_resched();
- trans_set_locked(trans, false);
-
- if (unlikely(trans->memory_allocation_failure)) {
- struct closure cl;
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- closure_sync(&cl);
- } while (ret);
- }
-
- /* Now, redo traversals in correct order: */
- i = 0;
- while (i < trans->nr_sorted) {
- btree_path_idx_t idx = trans->sorted[i];
-
- /*
- * Traversing a path can cause another path to be added at about
- * the same position:
- */
- if (trans->paths[idx].uptodate) {
- __btree_path_get(trans, &trans->paths[idx], false);
- ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
- __btree_path_put(trans, &trans->paths[idx], false);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, ENOMEM))
- goto retry_all;
- if (ret)
- goto err;
- } else {
- i++;
- }
- }
-
- /*
- * We used to assert that all paths had been traversed here
- * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
- * path->should_be_locked is not set yet, we might have unlocked and
- * then failed to relock a path - that's fine.
- */
-err:
- bch2_btree_cache_cannibalize_unlock(trans);
-
- trans->in_traverse_all = false;
-
- trace_and_count(c, trans_traverse_all, trans, trace_ip);
- return ret;
-}
-
-static inline bool btree_path_check_pos_in_node(struct btree_path *path,
- unsigned l, int check_pos)
-{
- if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
- return false;
- if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
- return false;
- return true;
-}
-
-static inline bool btree_path_good_node(struct btree_trans *trans,
- struct btree_path *path,
- unsigned l, int check_pos)
-{
- return is_btree_node(path, l) &&
- bch2_btree_node_relock(trans, path, l) &&
- btree_path_check_pos_in_node(path, l, check_pos);
-}
-
-static void btree_path_set_level_down(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_level)
-{
- unsigned l;
-
- path->level = new_level;
-
- for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
- if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(trans, path, l);
-
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
- bch2_btree_path_verify(trans, path);
-}
-
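-/*
- * Walk up the path until we reach a level whose node is still valid, can be
- * relocked, and still covers the search position; then relock the levels
- * above it up to locks_want, restarting the walk if any of those relocks
- * fail.
- */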
-static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
- struct btree_path *path,
- int check_pos)
-{
- unsigned i, l = path->level;
-again:
- while (btree_path_node(path, l) &&
- !btree_path_good_node(trans, path, l, check_pos))
- __btree_path_set_level_up(trans, path, l++);
-
- /* If we need intent locks, take them too: */
- for (i = l + 1;
- i < path->locks_want && btree_path_node(path, i);
- i++)
- if (!bch2_btree_node_relock(trans, path, i)) {
- while (l <= i)
- __btree_path_set_level_up(trans, path, l++);
- goto again;
- }
-
- return l;
-}
-
-static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
- struct btree_path *path,
- int check_pos)
-{
- return likely(btree_node_locked(path, path->level) &&
- btree_path_check_pos_in_node(path, path->level, check_pos))
- ? path->level
- : __btree_path_up_until_good_node(trans, path, check_pos);
-}
-
-/*
- * This is the main state machine for walking down the btree - walks down to a
- * specified depth
- *
- * Returns 0 on success, -EIO on error (error reading in a btree node).
- *
- * On error, caller (peek_node()/peek_key()) must return NULL; the error is
- * stashed in the iterator and returned from bch2_trans_exit().
- */
-int bch2_btree_path_traverse_one(struct btree_trans *trans,
- btree_path_idx_t path_idx,
- unsigned flags,
- unsigned long trace_ip)
-{
- struct btree_path *path = &trans->paths[path_idx];
- unsigned depth_want = path->level;
- int ret = -((int) trans->restarted);
-
- if (unlikely(ret))
- goto out;
-
- if (unlikely(!trans->srcu_held))
- bch2_trans_srcu_lock(trans);
-
- trace_btree_path_traverse_start(trans, path);
-
- /*
- * Ensure we obey path->should_be_locked: if it's set, we can't unlock
- * and re-traverse the path without a transaction restart:
- */
- if (path->should_be_locked) {
- ret = bch2_btree_path_relock(trans, path, trace_ip);
- goto out;
- }
-
- if (path->cached) {
- ret = bch2_btree_path_traverse_cached(trans, path_idx, flags);
- goto out;
- }
-
- path = &trans->paths[path_idx];
-
- if (unlikely(path->level >= BTREE_MAX_DEPTH))
- goto out_uptodate;
-
- path->level = btree_path_up_until_good_node(trans, path, 0);
- unsigned max_level = path->level;
-
- EBUG_ON(btree_path_node(path, path->level) &&
- !btree_node_locked(path, path->level));
-
- /*
- * Note: path->nodes[path->level] may be temporarily NULL here - that
- * would indicate to other code that we got to the end of the btree;
- * here it indicates that relocking the root failed - it's critical that
- * btree_path_lock_root() comes next and that it can't fail
- */
- while (path->level > depth_want) {
- ret = btree_path_node(path, path->level)
- ? btree_path_down(trans, path, flags, trace_ip)
- : btree_path_lock_root(trans, path, depth_want, trace_ip);
- if (unlikely(ret)) {
- if (ret == 1) {
- /*
- * No nodes at this level - got to the end of
- * the btree:
- */
- ret = 0;
- goto out;
- }
-
- __bch2_btree_path_unlock(trans, path);
- path->level = depth_want;
- path->l[path->level].b = ERR_PTR(ret);
- goto out;
- }
- }
-
- if (unlikely(max_level > path->level)) {
- struct btree_path *linked;
- unsigned iter;
-
- trans_for_each_path_with_node(trans, path_l(path)->b, linked, iter)
- for (unsigned j = path->level + 1; j < max_level; j++)
- linked->l[j] = path->l[j];
- }
-
-out_uptodate:
- path->uptodate = BTREE_ITER_UPTODATE;
- trace_btree_path_traverse_end(trans, path);
-out:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
- panic("ret %s (%i) trans->restarted %s (%i)\n",
- bch2_err_str(ret), ret,
- bch2_err_str(trans->restarted), trans->restarted);
- bch2_btree_path_verify(trans, path);
- return ret;
-}
-
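-/*
- * btree_path_copy(): copy all fields from ->pos onwards from @src, then take
- * an extra reference on each node lock @src holds so that both paths can be
- * unlocked independently.
- */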
-static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
- struct btree_path *src)
-{
- unsigned i, offset = offsetof(struct btree_path, pos);
-
- memcpy((void *) dst + offset,
- (void *) src + offset,
- sizeof(struct btree_path) - offset);
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++) {
- unsigned t = btree_node_locked_type(dst, i);
-
- if (t != BTREE_NODE_UNLOCKED)
- six_lock_increment(&dst->l[i].b->c.lock, t);
- }
-}
-
-static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
- bool intent, unsigned long ip)
-{
- btree_path_idx_t new = btree_path_alloc(trans, src);
- btree_path_copy(trans, trans->paths + new, trans->paths + src);
- __btree_path_get(trans, trans->paths + new, intent);
-#ifdef TRACK_PATH_ALLOCATED
- trans->paths[new].ip_allocated = ip;
-#endif
- return new;
-}
-
-__flatten
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
- btree_path_idx_t path, bool intent, unsigned long ip)
-{
- struct btree_path *old = trans->paths + path;
- __btree_path_put(trans, trans->paths + path, intent);
- path = btree_path_clone(trans, path, intent, ip);
- trace_btree_path_clone(trans, old, trans->paths + path);
- trans->paths[path].preserve = false;
- return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *trans,
- btree_path_idx_t path_idx, struct bpos new_pos,
- bool intent, unsigned long ip)
-{
- int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- EBUG_ON(!trans->paths[path_idx].ref);
-
- trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
-
- path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
-
- struct btree_path *path = trans->paths + path_idx;
- path->pos = new_pos;
- trans->paths_sorted = false;
-
- if (unlikely(path->cached)) {
- btree_node_unlock(trans, path, 0);
- path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
- goto out;
- }
-
- unsigned level = btree_path_up_until_good_node(trans, path, cmp);
-
- if (btree_path_node(path, level)) {
- struct btree_path_level *l = &path->l[level];
-
- BUG_ON(!btree_node_locked(path, level));
- /*
- * We might have to skip over many keys, or just a few: try
- * advancing the node iterator, and if we have to skip over too
- * many keys, just reinit it (we also reinit if we're rewinding,
- * since that is expensive).
- */
- if (cmp < 0 ||
- !btree_path_advance_to_pos(path, l, 8))
- bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
-
- /*
- * Iterators to interior nodes should always be pointed at the first
- * non-whiteout:
- */
- if (unlikely(level))
- bch2_btree_node_iter_peek(&l->iter, l->b);
- }
-
- if (unlikely(level != path->level)) {
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
- __bch2_btree_path_unlock(trans, path);
- }
-out:
- bch2_btree_path_verify(trans, path);
- return path_idx;
-}
-
-/* Btree path: main interface: */
-
-static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
- struct btree_path *sib;
-
- sib = prev_btree_path(trans, path);
- if (sib && !btree_path_cmp(sib, path))
- return sib;
-
- sib = next_btree_path(trans, path);
- if (sib && !btree_path_cmp(sib, path))
- return sib;
-
- return NULL;
-}
-
-static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
-{
- struct btree_path *sib;
-
- sib = prev_btree_path(trans, path);
- if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
- return sib;
-
- sib = next_btree_path(trans, path);
- if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
- return sib;
-
- return NULL;
-}
-
-static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
-{
- __bch2_btree_path_unlock(trans, trans->paths + path);
- btree_path_list_remove(trans, trans->paths + path);
- __clear_bit(path, trans->paths_allocated);
-}
-
-static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned l = path->level;
-
- do {
- if (!btree_path_node(path, l))
- break;
-
- if (!is_btree_node(path, l))
- return false;
-
- if (path->l[l].lock_seq != path->l[l].b->c.lock.seq)
- return false;
-
- l++;
- } while (l < path->locks_want);
-
- return true;
-}
-
-void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
-{
- struct btree_path *path = trans->paths + path_idx, *dup = NULL;
-
- if (!__btree_path_put(trans, path, intent))
- return;
-
- if (!path->preserve && !path->should_be_locked)
- goto free;
-
- dup = path->preserve
- ? have_path_at_pos(trans, path)
- : have_node_at_pos(trans, path);
- if (!dup)
- return;
-
- /*
- * If we need this path locked, the duplicate also has to be locked
- * before we free this one:
- */
- if (path->should_be_locked &&
- !dup->should_be_locked &&
- !trans->restarted) {
- if (!(trans->locked
- ? bch2_btree_path_relock_norestart(trans, dup)
- : bch2_btree_path_can_relock(trans, dup)))
- return;
-
- dup->should_be_locked = true;
- }
-
- BUG_ON(path->should_be_locked &&
- !trans->restarted &&
- trans->locked &&
- !btree_node_locked(dup, dup->level));
-
- path->should_be_locked = false;
- dup->preserve |= path->preserve;
-free:
- trace_btree_path_free(trans, path_idx, dup);
- __bch2_path_free(trans, path_idx);
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
-{
- panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
- trans->restart_count, restart_count,
- (void *) trans->last_begin_ip);
-}
-
-static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct printbuf buf = PRINTBUF;
- bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
- panic("in transaction restart: %s, last restarted by\n%s",
- bch2_err_str(trans->restarted),
- buf.buf);
-#else
- panic("in transaction restart: %s, last restarted by %pS\n",
- bch2_err_str(trans->restarted),
- (void *) trans->last_restarted_ip);
-#endif
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
-{
- if (trans->restarted)
- bch2_trans_in_restart_error(trans);
-
- if (!trans->locked)
- panic("trans should be locked, unlocked by %pS\n",
- (void *) trans->last_unlock_ip);
-
- BUG();
-}
-
-noinline __cold
-void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
-{
- prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
- trans->nr_updates, trans->fn, trans->journal_res.seq);
- printbuf_indent_add(buf, 2);
-
- trans_for_each_update(trans, i) {
- struct bkey_s_c old = { &i->old_k, i->old_v };
-
- prt_str(buf, "update: btree=");
- bch2_btree_id_to_text(buf, i->btree_id);
- prt_printf(buf, " cached=%u %pS\n",
- i->cached,
- (void *) i->ip_allocated);
-
- prt_printf(buf, " old ");
- bch2_bkey_val_to_text(buf, trans->c, old);
- prt_newline(buf);
-
- prt_printf(buf, " new ");
- bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
- prt_newline(buf);
- }
-
- for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
- e != btree_trans_journal_entries_top(trans);
- e = vstruct_next(e)) {
- bch2_journal_entry_to_text(buf, trans->c, e);
- prt_newline(buf);
- }
-
- printbuf_indent_sub(buf, 2);
-}
-
-static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
- struct btree_path *path = trans->paths + path_idx;
-
- prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
- path_idx, path->ref, path->intent_ref,
- path->preserve ? 'P' : ' ',
- path->should_be_locked ? 'S' : ' ',
- path->cached ? 'C' : 'B');
- bch2_btree_id_level_to_text(out, path->btree_id, path->level);
- prt_str(out, " pos ");
- bch2_bpos_to_text(out, path->pos);
-
- if (!path->cached && btree_node_locked(path, path->level)) {
- prt_char(out, ' ');
- struct btree *b = path_l(path)->b;
- bch2_bpos_to_text(out, b->data->min_key);
- prt_char(out, '-');
- bch2_bpos_to_text(out, b->key.k.p);
- }
-
-#ifdef TRACK_PATH_ALLOCATED
- prt_printf(out, " %pS", (void *) path->ip_allocated);
-#endif
-}
-
-static const char *btree_node_locked_str(enum btree_node_locked_type t)
-{
- switch (t) {
- case BTREE_NODE_UNLOCKED:
- return "unlocked";
- case BTREE_NODE_READ_LOCKED:
- return "read";
- case BTREE_NODE_INTENT_LOCKED:
- return "intent";
- case BTREE_NODE_WRITE_LOCKED:
- return "write";
- default:
- return NULL;
- }
-}
-
-void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
-{
- bch2_btree_path_to_text_short(out, trans, path_idx);
-
- struct btree_path *path = trans->paths + path_idx;
-
- prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want);
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
- prt_printf(out, "l=%u locks %s seq %u node ", l,
- btree_node_locked_str(btree_node_locked_type(path, l)),
- path->l[l].lock_seq);
-
- int ret = PTR_ERR_OR_ZERO(path->l[l].b);
- if (ret)
- prt_str(out, bch2_err_str(ret));
- else
- prt_printf(out, "%px", path->l[l].b);
- prt_newline(out);
- }
- printbuf_indent_sub(out, 2);
-}
-
-static noinline __cold
-void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
- bool nosort)
-{
- struct trans_for_each_path_inorder_iter iter;
-
- if (!nosort)
- btree_trans_sort_paths(trans);
-
- trans_for_each_path_idx_inorder(trans, iter) {
- bch2_btree_path_to_text_short(out, trans, iter.path_idx);
- prt_newline(out);
- }
-}
-
-noinline __cold
-void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
-{
- __bch2_trans_paths_to_text(out, trans, false);
-}
-
-static noinline __cold
-void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
-{
- struct printbuf buf = PRINTBUF;
-
- __bch2_trans_paths_to_text(&buf, trans, nosort);
- bch2_trans_updates_to_text(&buf, trans);
-
- bch2_print_str(trans->c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline __cold
-void bch2_dump_trans_paths_updates(struct btree_trans *trans)
-{
- __bch2_dump_trans_paths_updates(trans, false);
-}
-
-noinline __cold
-static void bch2_trans_update_max_paths(struct btree_trans *trans)
-{
- struct btree_transaction_stats *s = btree_trans_stats(trans);
- struct printbuf buf = PRINTBUF;
- size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
-
- bch2_trans_paths_to_text(&buf, trans);
-
- if (!buf.allocation_failure) {
- mutex_lock(&s->lock);
- if (nr > s->nr_max_paths) {
- s->nr_max_paths = nr;
- swap(s->max_paths_text, buf.buf);
- }
- mutex_unlock(&s->lock);
- }
-
- printbuf_exit(&buf);
-
- trans->nr_paths_max = nr;
-}
-
-noinline __cold
-int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
-{
- if (trace_trans_restart_too_many_iters_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_trans_paths_to_text(&buf, trans);
- trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
- printbuf_exit(&buf);
- }
-
- count_event(trans->c, trans_restart_too_many_iters);
-
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
-}
-
-static noinline void btree_path_overflow(struct btree_trans *trans)
-{
- bch2_dump_trans_paths_updates(trans);
- bch_err(trans->c, "trans path overflow");
-}
-
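-/*
- * Double the number of paths: the allocated-bitmap, paths array, sorted-index
- * array and updates array live in a single allocation, so reallocate them
- * together, copy the old contents across, publish the new pointers with
- * rcu_assign_pointer() and free the old allocation via kfree_rcu (unless it
- * was the initial inline buffer).
- */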
-static noinline void btree_paths_realloc(struct btree_trans *trans)
-{
- unsigned nr = trans->nr_paths * 2;
-
- void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
- sizeof(struct btree_trans_paths) +
- nr * sizeof(struct btree_path) +
- nr * sizeof(btree_path_idx_t) + 8 +
- nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
-
- unsigned long *paths_allocated = p;
- memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
- p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
-
- p += sizeof(struct btree_trans_paths);
- struct btree_path *paths = p;
- *trans_paths_nr(paths) = nr;
- memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
- p += nr * sizeof(struct btree_path);
-
- btree_path_idx_t *sorted = p;
- memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
- p += nr * sizeof(btree_path_idx_t) + 8;
-
- struct btree_insert_entry *updates = p;
- memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
-
- unsigned long *old = trans->paths_allocated;
-
- rcu_assign_pointer(trans->paths_allocated, paths_allocated);
- rcu_assign_pointer(trans->paths, paths);
- rcu_assign_pointer(trans->sorted, sorted);
- rcu_assign_pointer(trans->updates, updates);
-
- trans->nr_paths = nr;
-
- if (old != trans->_paths_allocated)
- kfree_rcu_mightsleep(old);
-}
-
-static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
- btree_path_idx_t pos)
-{
- btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
-
- if (unlikely(idx == trans->nr_paths)) {
- if (trans->nr_paths == BTREE_ITER_MAX) {
- btree_path_overflow(trans);
- return 0;
- }
-
- btree_paths_realloc(trans);
- }
-
- /*
- * Do this before marking the new path as allocated, since it won't be
- * initialized yet:
- */
- if (unlikely(idx > trans->nr_paths_max))
- bch2_trans_update_max_paths(trans);
-
- __set_bit(idx, trans->paths_allocated);
-
- struct btree_path *path = &trans->paths[idx];
- path->ref = 0;
- path->intent_ref = 0;
- path->nodes_locked = 0;
-
- btree_path_list_add(trans, pos, idx);
- trans->paths_sorted = false;
- return idx;
-}
-
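-/*
- * bch2_path_get(): find an existing path with the same btree, level and
- * cached-ness that can be upgraded to the requested locks_want and reuse it;
- * otherwise allocate a new path, initialized as BTREE_ITER_NEED_TRAVERSE.
- */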
-btree_path_idx_t bch2_path_get(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos,
- unsigned locks_want, unsigned level,
- unsigned flags, unsigned long ip)
-{
- struct btree_path *path;
- bool cached = flags & BTREE_ITER_cached;
- bool intent = flags & BTREE_ITER_intent;
- struct trans_for_each_path_inorder_iter iter;
- btree_path_idx_t path_pos = 0, path_idx;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_trans_verify_locks(trans);
-
- btree_trans_sort_paths(trans);
-
- if (intent)
- locks_want = max(locks_want, level + 1);
- locks_want = min(locks_want, BTREE_MAX_DEPTH);
-
- trans_for_each_path_inorder(trans, path, iter) {
- if (__btree_path_cmp(path,
- btree_id,
- cached,
- pos,
- level) > 0)
- break;
-
- path_pos = iter.path_idx;
- }
-
- if (path_pos &&
- trans->paths[path_pos].cached == cached &&
- trans->paths[path_pos].btree_id == btree_id &&
- trans->paths[path_pos].level == level &&
- bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) {
- trace_btree_path_get(trans, trans->paths + path_pos, &pos);
-
- __btree_path_get(trans, trans->paths + path_pos, intent);
- path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
- path = trans->paths + path_idx;
- } else {
- path_idx = btree_path_alloc(trans, path_pos);
- path = trans->paths + path_idx;
-
- __btree_path_get(trans, path, intent);
- path->pos = pos;
- path->btree_id = btree_id;
- path->cached = cached;
- path->uptodate = BTREE_ITER_NEED_TRAVERSE;
- path->should_be_locked = false;
- path->level = level;
- path->locks_want = locks_want;
- path->nodes_locked = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
- path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
-#ifdef TRACK_PATH_ALLOCATED
- path->ip_allocated = ip;
-#endif
- trans->paths_sorted = false;
-
- trace_btree_path_alloc(trans, path);
- }
-
- if (!(flags & BTREE_ITER_nopreserve))
- path->preserve = true;
-
- /*
- * If the path has locks_want greater than requested, we don't downgrade
- * it here: after a transaction restart (e.g. because a btree node split
- * needed to upgrade locks), we might be putting and re-getting the iterator.
- * Downgrading iterators only happens via bch2_trans_downgrade(), after
- * a successful transaction commit.
- */
-
- return path_idx;
-}
-
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *trans,
- enum btree_id btree_id,
- unsigned level,
- struct bpos pos)
-{
- btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
- BTREE_ITER_nopreserve|
- BTREE_ITER_intent, _RET_IP_);
- path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
-
- struct btree_path *path = trans->paths + path_idx;
- bch2_btree_path_downgrade(trans, path);
- __bch2_btree_path_unlock(trans, path);
- return path_idx;
-}
-
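-/*
- * Return the key at path->pos: from the node iterator for btree paths, or
- * from the bkey_cached entry for cached paths. If there is no key at exactly
- * path->pos, synthesize a deleted key in @u so the caller always gets back a
- * valid bkey.
- */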
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
-{
- struct btree_path_level *l = path_l(path);
- struct bkey_packed *_k;
- struct bkey_s_c k;
-
- if (unlikely(!l->b))
- return bkey_s_c_null;
-
- EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
- EBUG_ON(!btree_node_locked(path, path->level));
-
- if (!path->cached) {
- _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
-
- EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
-
- if (!k.k || !bpos_eq(path->pos, k.k->p))
- goto hole;
- } else {
- struct bkey_cached *ck = (void *) path->l[0].b;
- if (!ck)
- return bkey_s_c_null;
-
- EBUG_ON(path->btree_id != ck->key.btree_id ||
- !bkey_eq(path->pos, ck->key.pos));
-
- *u = ck->k->k;
- k = (struct bkey_s_c) { u, &ck->k->v };
- }
-
- return k;
-hole:
- bkey_init(u);
- u->p = path->pos;
- return (struct bkey_s_c) { u, NULL };
-}
-
-void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (!iter->path || trans->restarted)
- return;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- path->preserve = false;
- if (path->ref == 1)
- path->should_be_locked = false;
-}
-
-/* Btree iterators: */
-
-int __must_check
-__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
-{
- return bch2_btree_path_traverse(trans, iter->path, iter->flags);
-}
-
-int __must_check
-bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path,
- btree_iter_search_key(iter),
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- return ret;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- if (btree_path_node(path, path->level))
- btree_path_set_should_be_locked(trans, path);
- return 0;
-}
-
-/* Iterate across nodes (leaf and interior nodes) */
-
-struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- struct btree *b = NULL;
- int ret;
-
- EBUG_ON(trans->paths[iter->path].cached);
- bch2_btree_iter_verify(trans, iter);
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- goto err;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- b = btree_path_node(path, path->level);
- if (!b)
- goto out;
-
- BUG_ON(bpos_lt(b->key.k.p, iter->pos));
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = b->key.k.p;
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out:
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(trans, iter);
-
- return b;
-err:
- b = ERR_PTR(ret);
- goto out;
-}
-
-/* Only kept for -tools */
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- struct btree *b;
-
- while (b = bch2_btree_iter_peek_node(trans, iter),
- bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
- bch2_trans_begin(trans);
-
- return b;
-}
-
-struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct btree *b = NULL;
- int ret;
-
- EBUG_ON(trans->paths[iter->path].cached);
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(trans, iter);
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- goto err;
-
- struct btree_path *path = btree_iter_path(trans, iter);
-
- /* already at end? */
- if (!btree_path_node(path, path->level))
- return NULL;
-
- /* got to end? */
- if (!btree_path_node(path, path->level + 1)) {
- path->should_be_locked = false;
- btree_path_set_level_up(trans, path);
- return NULL;
- }
-
- /*
- * We don't correctly handle nodes with extra intent locks here:
- * downgrade so we don't violate locking invariants
- */
- bch2_btree_path_downgrade(trans, path);
-
- if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
- trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
- __bch2_btree_path_unlock(trans, path);
- path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
- path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
- goto err;
- }
-
- b = btree_path_node(path, path->level + 1);
-
- if (bpos_eq(iter->pos, b->key.k.p)) {
- __btree_path_set_level_up(trans, path, path->level++);
- } else {
- if (btree_lock_want(path, path->level + 1) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(trans, path, path->level + 1);
-
- /*
- * Haven't gotten to the end of the parent node: go back down to
- * the next child node
- */
- iter->path = bch2_btree_path_set_pos(trans, iter->path,
- bpos_successor(iter->pos),
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- path = btree_iter_path(trans, iter);
- btree_path_set_level_down(trans, path, iter->min_depth);
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (ret)
- goto err;
-
- path = btree_iter_path(trans, iter);
- b = path->l[path->level].b;
- }
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = b->key.k.p;
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
- EBUG_ON(btree_iter_path(trans, iter)->uptodate);
-out:
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(trans, iter);
-
- return b;
-err:
- b = ERR_PTR(ret);
- goto out;
-}
-
-/* Iterate across keys (in leaf nodes only) */
-
-inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_all_snapshots
- ? bpos_eq(pos, SPOS_MAX)
- : bkey_eq(pos, SPOS_MAX));
-
- if (ret && !(iter->flags & BTREE_ITER_is_extents))
- pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(trans, iter, pos);
- return ret;
-}
-
-inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct bpos pos = bkey_start_pos(&iter->k);
- bool ret = !(iter->flags & BTREE_ITER_all_snapshots
- ? bpos_eq(pos, POS_MIN)
- : bkey_eq(pos, POS_MIN));
-
- if (ret && !(iter->flags & BTREE_ITER_is_extents))
- pos = bkey_predecessor(iter, pos);
- bch2_btree_iter_set_pos(trans, iter, pos);
- return ret;
-}
-
-static noinline
-void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos search_key, struct bkey_s_c *k)
-{
- struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
-
- trans_for_each_update(trans, i)
- if (!i->key_cache_already_flushed &&
- i->btree_id == iter->btree_id &&
- bpos_le(i->k->k.p, search_key) &&
- bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
- iter->k = i->k->k;
- *k = bkey_i_to_s_c(i->k);
- }
-}
-
-static noinline
-void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos search_key,
- struct bkey_s_c *k)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bpos end = path_l(path)->b->key.k.p;
-
- trans_for_each_update(trans, i)
- if (!i->key_cache_already_flushed &&
- i->btree_id == iter->btree_id &&
- bpos_ge(i->k->k.p, search_key) &&
- bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
- iter->k = i->k->k;
- *k = bkey_i_to_s_c(i->k);
- }
-}
-
-static noinline
-void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k)
-{
- trans_for_each_update(trans, i)
- if (!i->key_cache_already_flushed &&
- i->btree_id == iter->btree_id &&
- bpos_eq(i->k->k.p, iter->pos)) {
- iter->k = i->k->k;
- *k = bkey_i_to_s_c(i->k);
- }
-}
-
-static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_pos,
- struct bpos end_pos)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
-
- return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
- path->level,
- search_pos,
- end_pos,
- &iter->journal_idx);
-}
-
-static noinline
-struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
-
- if (k) {
- iter->k = k->k;
- return bkey_i_to_s_c(k);
- } else {
- return bkey_s_c_null;
- }
-}
-
-static noinline
-void btree_trans_peek_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_key,
- struct bkey_s_c *k)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
- bch2_btree_journal_peek(trans, iter, search_key,
- k->k ? k->k->p : path_l(path)->b->key.k.p);
- if (next_journal) {
- iter->k = next_journal->k;
- *k = bkey_i_to_s_c(next_journal);
- }
-}
-
-static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_key,
- struct bpos end_pos)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
-
- return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
- path->level,
- search_key,
- end_pos,
- &iter->journal_idx);
-}
-
-static noinline
-void btree_trans_peek_prev_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_key,
- struct bkey_s_c *k)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
- bch2_btree_journal_peek_prev(trans, iter, search_key,
- k->k ? k->k->p : path_l(path)->b->data->min_key);
-
- if (next_journal) {
- iter->k = next_journal->k;
- *k = bkey_i_to_s_c(next_journal);
- }
-}
-
-/*
- * Checks btree key cache for key at iter->pos and returns it if present, or
- * bkey_s_c_null:
- */
-static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct bkey u;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- if ((iter->flags & BTREE_ITER_key_cache_fill) &&
- bpos_eq(iter->pos, pos))
- return bkey_s_c_null;
-
- if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
- return bkey_s_c_null;
-
- if (!iter->key_cache_path)
- iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
- iter->flags & BTREE_ITER_intent, 0,
- iter->flags|BTREE_ITER_cached|
- BTREE_ITER_cached_nofill,
- _THIS_IP_);
-
- iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- iter->flags|BTREE_ITER_cached) ?:
- bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
-
- k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
- if (!k.k)
- return k;
-
- if ((iter->flags & BTREE_ITER_all_snapshots) &&
- !bpos_eq(pos, k.k->p))
- return bkey_s_c_null;
-
- iter->k = u;
- k.k = &iter->k;
- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
- return k;
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos search_key)
-{
- struct bkey_s_c k, k2;
- int ret;
-
- EBUG_ON(btree_iter_path(trans, iter)->cached);
- bch2_btree_iter_verify(trans, iter);
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(trans, iter, iter->pos);
- k = bkey_s_c_err(ret);
- break;
- }
-
- struct btree_path *path = btree_iter_path(trans, iter);
- struct btree_path_level *l = path_l(path);
-
- if (unlikely(!l->b)) {
- /* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
- k = bkey_s_c_null;
- break;
- }
-
- btree_path_set_should_be_locked(trans, path);
-
- k = btree_path_level_peek_all(trans->c, l, &iter->k);
-
- if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- k.k &&
- (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
- k = k2;
- if (bkey_err(k)) {
- bch2_btree_iter_set_pos(trans, iter, iter->pos);
- break;
- }
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_journal))
- btree_trans_peek_journal(trans, iter, search_key, &k);
-
- if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
- trans->nr_updates))
- bch2_btree_trans_peek_updates(trans, iter, search_key, &k);
-
- if (k.k && bkey_deleted(k.k)) {
- /*
- * If we've got a whiteout, and it's after the search
- * key, advance the search key to the whiteout instead
- * of just after the whiteout - it might be a btree
- * whiteout, with a real key at the same position, since
- * in the btree, deleted keys sort before non-deleted keys.
- */
- search_key = !bpos_eq(search_key, k.k->p)
- ? k.k->p
- : bpos_successor(k.k->p);
- continue;
- }
-
- if (likely(k.k)) {
- break;
- } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
- /* Advance to next leaf node: */
- search_key = bpos_successor(l->b->key.k.p);
- } else {
- /* End of btree: */
- bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
- k = bkey_s_c_null;
- break;
- }
- }
-
- bch2_btree_iter_verify(trans, iter);
-
- if (trace___btree_iter_peek_enabled()) {
- CLASS(printbuf, buf)();
-
- int ret = bkey_err(k);
- if (ret)
- prt_str(&buf, bch2_err_str(ret));
- else if (k.k)
- bch2_bkey_val_to_text(&buf, trans->c, k);
- else
- prt_str(&buf, "(null)");
- trace___btree_iter_peek(trans->c, buf.buf);
- }
-
- return k;
-}
-
-/**
- * bch2_btree_iter_peek_max() - returns first key greater than or equal to
- * iterator's current position
- * @trans: btree transaction object
- * @iter: iterator to peek from
- * @end: search limit: returns keys less than or equal to @end
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end)
-{
- struct bpos search_key = btree_iter_search_key(iter);
- struct bkey_s_c k;
- struct bpos iter_pos = iter->pos;
- int ret;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- if (iter->update_path) {
- bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
- }
-
- while (1) {
- k = __bch2_btree_iter_peek(trans, iter, search_key);
- if (unlikely(!k.k))
- goto end;
- if (unlikely(bkey_err(k)))
- goto out_no_locked;
-
- if (iter->flags & BTREE_ITER_filter_snapshots) {
- /*
- * We need to check against @end before FILTER_SNAPSHOTS because
- * if we get to a different inode than requested we might be
- * seeing keys for a different snapshot tree that will all be
- * filtered out.
- *
- * But we can't do the full check here, because bkey_start_pos()
- * isn't monotonically increasing before FILTER_SNAPSHOTS, and
- * that's what we check against in extents mode:
- */
- if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
- ? bkey_gt(k.k->p, end)
- : k.k->p.inode > end.inode))
- goto end;
-
- if (iter->update_path &&
- !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
- }
-
- if ((iter->flags & BTREE_ITER_intent) &&
- !(iter->flags & BTREE_ITER_is_extents) &&
- !iter->update_path) {
- struct bpos pos = k.k->p;
-
- if (pos.snapshot < iter->snapshot) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
-
- pos.snapshot = iter->snapshot;
-
- /*
- * advance, same as on exit for iter->path, but only up
- * to snapshot
- */
- __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
- iter->update_path = iter->path;
-
- iter->update_path = bch2_btree_path_set_pos(trans,
- iter->update_path, pos,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
- }
-
- /*
- * We can never have a key in a leaf node at POS_MAX, so
- * we don't have to check these successor() calls:
- */
- if (!bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
-
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_key_cache_fill)) {
- search_key = bkey_successor(iter, k.k->p);
- continue;
- }
- }
-
- /*
- * iter->pos should be monotonically increasing, and always be
- * equal to the key we just returned - except extents can
- * straddle iter->pos:
- */
- if (!(iter->flags & BTREE_ITER_is_extents))
- iter_pos = k.k->p;
- else
- iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
-
- if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) :
- iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) :
- bkey_gt(iter_pos, end)))
- goto end;
-
- break;
- }
-
- iter->pos = iter_pos;
-
- iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
- if (iter->update_path) {
- ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
- if (unlikely(ret))
- k = bkey_s_c_err(ret);
- else
- btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
- }
-
- if (!(iter->flags & BTREE_ITER_all_snapshots))
- iter->pos.snapshot = iter->snapshot;
-
- ret = bch2_btree_iter_verify_ret(trans, iter, k);
- if (unlikely(ret)) {
- bch2_btree_iter_set_pos(trans, iter, iter->pos);
- k = bkey_s_c_err(ret);
- }
-
- bch2_btree_iter_verify_entry_exit(iter);
-
- if (trace_btree_iter_peek_max_enabled()) {
- CLASS(printbuf, buf)();
-
- int ret = bkey_err(k);
- if (ret)
- prt_str(&buf, bch2_err_str(ret));
- else if (k.k)
- bch2_bkey_val_to_text(&buf, trans->c, k);
- else
- prt_str(&buf, "(null)");
- trace_btree_iter_peek_max(trans->c, buf.buf);
- }
-
- return k;
-end:
- bch2_btree_iter_set_pos(trans, iter, end);
- k = bkey_s_c_null;
- goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_next() - returns first key greater than iterator's current
- * position
- * @trans: btree transaction object
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (!bch2_btree_iter_advance(trans, iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek(trans, iter);
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos search_key)
-{
- struct bkey_s_c k, k2;
-
- bch2_btree_iter_verify(trans, iter);
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(trans, iter, iter->pos);
- k = bkey_s_c_err(ret);
- break;
- }
-
- struct btree_path *path = btree_iter_path(trans, iter);
- struct btree_path_level *l = path_l(path);
-
- if (unlikely(!l->b)) {
- /* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
- k = bkey_s_c_null;
- break;
- }
-
- btree_path_set_should_be_locked(trans, path);
-
- k = btree_path_level_peek_all(trans->c, l, &iter->k);
- if (!k.k || bpos_gt(k.k->p, search_key)) {
- k = btree_path_level_prev(trans, path, l, &iter->k);
-
- BUG_ON(k.k && bpos_gt(k.k->p, search_key));
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- k.k &&
- (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
- k = k2;
- if (bkey_err(k2)) {
- bch2_btree_iter_set_pos(trans, iter, iter->pos);
- break;
- }
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_journal))
- btree_trans_peek_prev_journal(trans, iter, search_key, &k);
-
- if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
- trans->nr_updates))
- bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k);
-
- if (likely(k.k && !bkey_deleted(k.k))) {
- break;
- } else if (k.k) {
- search_key = bpos_predecessor(k.k->p);
- } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
- /* Advance to previous leaf node: */
- search_key = bpos_predecessor(path->l[0].b->data->min_key);
- } else {
- /* Start of btree: */
- bch2_btree_iter_set_pos(trans, iter, POS_MIN);
- k = bkey_s_c_null;
- break;
- }
- }
-
- bch2_btree_iter_verify(trans, iter);
- return k;
-}
-
-/**
- * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
- * iterator's current position
- * @trans: btree transaction object
- * @iter: iterator to peek from
- * @end: search limit: returns keys greater than or equal to @end
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end)
-{
- if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
- !bkey_eq(iter->pos, POS_MAX) &&
- !((iter->flags & BTREE_ITER_is_extents) &&
- iter->pos.offset == U64_MAX)) {
-
- /*
- * bkey_start_pos(), for extents, is not monotonically
- * increasing until after filtering for snapshots:
- *
- * Thus, for extents we need to search forward until we find a
- * real visible extent - easiest to just use peek_slot() (which
- * internally uses peek() for extents)
- */
- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
- if (bkey_err(k))
- return k;
-
- if (!bkey_deleted(k.k) &&
- (!(iter->flags & BTREE_ITER_is_extents) ||
- bkey_lt(bkey_start_pos(k.k), iter->pos)))
- return k;
- }
-
- struct bpos search_key = iter->pos;
- struct bkey_s_c k;
- btree_path_idx_t saved_path = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
-
- int ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- while (1) {
- k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
- if (unlikely(!k.k))
- goto end;
- if (unlikely(bkey_err(k)))
- goto out_no_locked;
-
- if (iter->flags & BTREE_ITER_filter_snapshots) {
- struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
- if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
- /*
- * If we have a saved candidate, and we're past
- * the last possible snapshot overwrite, return
- * it:
- */
- bch2_path_put(trans, iter->path,
- iter->flags & BTREE_ITER_intent);
- iter->path = saved_path;
- saved_path = 0;
- k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
- break;
- }
-
- /*
- * We need to check against @end before FILTER_SNAPSHOTS because
- * if we get to a different inode than requested we might be
- * seeing keys for a different snapshot tree that will all be
- * filtered out.
- */
- if (unlikely(bkey_lt(k.k->p, end)))
- goto end;
-
- if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
- search_key = bpos_predecessor(k.k->p);
- continue;
- }
-
- if (k.k->p.snapshot != iter->snapshot) {
- /*
- * We have a key visible in iter->snapshot, but it
- * might have overwrites: save it and keep
- * searching. Unless it's a whiteout - then drop
- * our previous saved candidate:
- */
- if (saved_path) {
- bch2_path_put(trans, saved_path,
- iter->flags & BTREE_ITER_intent);
- saved_path = 0;
- }
-
- if (!bkey_whiteout(k.k)) {
- saved_path = btree_path_clone(trans, iter->path,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
- trace_btree_path_save_pos(trans,
- trans->paths + iter->path,
- trans->paths + saved_path);
- }
-
- search_key = bpos_predecessor(k.k->p);
- continue;
- }
-
- if (bkey_whiteout(k.k)) {
- search_key = bkey_predecessor(iter, k.k->p);
- search_key.snapshot = U32_MAX;
- continue;
- }
- }
-
- EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) :
- iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
- bkey_gt(k.k->p, iter->pos));
-
- if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) :
- iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) :
- bkey_lt(k.k->p, end)))
- goto end;
-
- break;
- }
-
- /* Extents can straddle iter->pos: */
- iter->pos = bpos_min(iter->pos, k.k->p);
-
- if (iter->flags & BTREE_ITER_filter_snapshots)
- iter->pos.snapshot = iter->snapshot;
-out_no_locked:
- if (saved_path)
- bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent);
-
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(trans, iter);
-
- if (trace_btree_iter_peek_prev_min_enabled()) {
- CLASS(printbuf, buf)();
-
- int ret = bkey_err(k);
- if (ret)
- prt_str(&buf, bch2_err_str(ret));
- else if (k.k)
- bch2_bkey_val_to_text(&buf, trans->c, k);
- else
- prt_str(&buf, "(null)");
- trace_btree_iter_peek_prev_min(trans->c, buf.buf);
- }
- return k;
-end:
- bch2_btree_iter_set_pos(trans, iter, end);
- k = bkey_s_c_null;
- goto out_no_locked;
-}
-
-/**
- * bch2_btree_iter_prev() - returns first key less than iterator's current
- * position
- * @trans: btree transaction object
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (!bch2_btree_iter_rewind(trans, iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_prev(trans, iter);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct bpos search_key;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(trans, iter);
- bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out;
- }
-
- /* extents can't span inode numbers: */
- if ((iter->flags & BTREE_ITER_is_extents) &&
- unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
- if (iter->pos.inode == KEY_INODE_MAX) {
- k = bkey_s_c_null;
- goto out2;
- }
-
- bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
- }
-
- search_key = btree_iter_search_key(iter);
- iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out;
- }
-
- struct btree_path *path = btree_iter_path(trans, iter);
- if (unlikely(!btree_path_node(path, path->level))) {
- k = bkey_s_c_null;
- goto out2;
- }
-
- btree_path_set_should_be_locked(trans, path);
-
- if ((iter->flags & BTREE_ITER_cached) ||
- !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
- k = bkey_s_c_null;
-
- if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
- trans->nr_updates)) {
- bch2_btree_trans_peek_slot_updates(trans, iter, &k);
- if (k.k)
- goto out;
- }
-
- if (unlikely(iter->flags & BTREE_ITER_with_journal) &&
- (k = btree_trans_peek_slot_journal(trans, iter)).k)
- goto out;
-
- if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
- if (!bkey_err(k))
- iter->k = *k.k;
- /* We're not returning a key from iter->path: */
- goto out;
- }
-
- k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
- if (unlikely(!k.k))
- goto out;
-
- if (unlikely(k.k->type == KEY_TYPE_whiteout &&
- (iter->flags & BTREE_ITER_filter_snapshots) &&
- !(iter->flags & BTREE_ITER_key_cache_fill)))
- iter->k.type = KEY_TYPE_deleted;
- } else {
- struct bpos next;
- struct bpos end = iter->pos;
-
- if (iter->flags & BTREE_ITER_is_extents)
- end.offset = U64_MAX;
-
- EBUG_ON(btree_iter_path(trans, iter)->level);
-
- if (iter->flags & BTREE_ITER_intent) {
- struct btree_iter iter2;
-
- bch2_trans_copy_iter(trans, &iter2, iter);
- k = bch2_btree_iter_peek_max(trans, &iter2, end);
-
- if (k.k && !bkey_err(k)) {
- swap(iter->key_cache_path, iter2.key_cache_path);
- iter->k = iter2.k;
- k.k = &iter->k;
- }
- bch2_trans_iter_exit(trans, &iter2);
- } else {
- struct bpos pos = iter->pos;
-
- k = bch2_btree_iter_peek_max(trans, iter, end);
- if (unlikely(bkey_err(k)))
- bch2_btree_iter_set_pos(trans, iter, pos);
- else
- iter->pos = pos;
- }
-
- if (unlikely(bkey_err(k)))
- goto out;
-
- next = k.k ? bkey_start_pos(k.k) : POS_MAX;
-
- if (bkey_lt(iter->pos, next)) {
- bkey_init(&iter->k);
- iter->k.p = iter->pos;
-
- if (iter->flags & BTREE_ITER_is_extents) {
- bch2_key_resize(&iter->k,
- min_t(u64, KEY_SIZE_MAX,
- (next.inode == iter->pos.inode
- ? next.offset
- : KEY_OFFSET_MAX) -
- iter->pos.offset));
- EBUG_ON(!iter->k.size);
- }
-
- k = (struct bkey_s_c) { &iter->k, NULL };
- }
- }
-out:
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(trans, iter);
- ret = bch2_btree_iter_verify_ret(trans, iter, k);
- if (unlikely(ret))
- k = bkey_s_c_err(ret);
-out2:
- if (trace_btree_iter_peek_slot_enabled()) {
- CLASS(printbuf, buf)();
-
- int ret = bkey_err(k);
- if (ret)
- prt_str(&buf, bch2_err_str(ret));
- else if (k.k)
- bch2_bkey_val_to_text(&buf, trans->c, k);
- else
- prt_str(&buf, "(null)");
- trace_btree_iter_peek_slot(trans->c, buf.buf);
- }
-
- return k;
-}
-
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (!bch2_btree_iter_advance(trans, iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (!bch2_btree_iter_rewind(trans, iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-/* Obsolete, but still used by rust wrapper in -tools */
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
-/* new transactional stuff: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
-
- trans_for_each_path(trans, path, i) {
- BUG_ON(path->sorted_idx >= trans->nr_sorted);
- BUG_ON(trans->sorted[path->sorted_idx] != i);
- }
-
- for (i = 0; i < trans->nr_sorted; i++) {
- unsigned idx = trans->sorted[i];
-
- BUG_ON(!test_bit(idx, trans->paths_allocated));
- BUG_ON(trans->paths[idx].sorted_idx != i);
- }
-}
-
-static void btree_trans_verify_sorted(struct btree_trans *trans)
-{
- struct btree_path *path, *prev = NULL;
- struct trans_for_each_path_inorder_iter iter;
-
- if (!static_branch_unlikely(&bch2_debug_check_iterators))
- return;
-
- trans_for_each_path_inorder(trans, path, iter) {
- if (prev && btree_path_cmp(prev, path) > 0) {
- __bch2_dump_trans_paths_updates(trans, true);
- panic("trans paths out of order!\n");
- }
- prev = path;
- }
-}
-#else
-static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
-static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
-#endif
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
-{
- int i, l = 0, r = trans->nr_sorted, inc = 1;
- bool swapped;
-
- btree_trans_verify_sorted_refs(trans);
-
- if (trans->paths_sorted)
- goto out;
-
- /*
- * Cocktail shaker sort: this is efficient because iterators will be
- * mostly sorted.
- */
- do {
- swapped = false;
-
- for (i = inc > 0 ? l : r - 2;
- i + 1 < r && i >= l;
- i += inc) {
- if (btree_path_cmp(trans->paths + trans->sorted[i],
- trans->paths + trans->sorted[i + 1]) > 0) {
- swap(trans->sorted[i], trans->sorted[i + 1]);
- trans->paths[trans->sorted[i]].sorted_idx = i;
- trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
- swapped = true;
- }
- }
-
- if (inc > 0)
- --r;
- else
- l++;
- inc = -inc;
- } while (swapped);
-
- trans->paths_sorted = true;
-out:
- btree_trans_verify_sorted(trans);
-}
-
-static inline void btree_path_list_remove(struct btree_trans *trans,
- struct btree_path *path)
-{
- EBUG_ON(path->sorted_idx >= trans->nr_sorted);
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- trans->nr_sorted--;
- memmove_u64s_down_small(trans->sorted + path->sorted_idx,
- trans->sorted + path->sorted_idx + 1,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
- sizeof(u64) / sizeof(btree_path_idx_t)));
-#else
- array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
-#endif
- for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
- trans->paths[trans->sorted[i]].sorted_idx = i;
-}
-
-static inline void btree_path_list_add(struct btree_trans *trans,
- btree_path_idx_t pos,
- btree_path_idx_t path_idx)
-{
- struct btree_path *path = trans->paths + path_idx;
-
- path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
-
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
- trans->sorted + path->sorted_idx,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
- sizeof(u64) / sizeof(btree_path_idx_t)));
- trans->nr_sorted++;
- trans->sorted[path->sorted_idx] = path_idx;
-#else
- array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
-#endif
-
- for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
- trans->paths[trans->sorted[i]].sorted_idx = i;
-
- btree_trans_verify_sorted_refs(trans);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
-{
- if (iter->update_path)
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- if (iter->path)
- bch2_path_put(trans, iter->path,
- iter->flags & BTREE_ITER_intent);
- if (iter->key_cache_path)
- bch2_path_put(trans, iter->key_cache_path,
- iter->flags & BTREE_ITER_intent);
- iter->path = 0;
- iter->update_path = 0;
- iter->key_cache_path = 0;
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_id btree_id, struct bpos pos,
- unsigned flags)
-{
- bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, 0, flags),
- _RET_IP_);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_id btree_id,
- struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags)
-{
- flags |= BTREE_ITER_not_extents;
- flags |= BTREE_ITER_snapshot_field;
- flags |= BTREE_ITER_all_snapshots;
-
- if (!depth && btree_id_cached(trans->c, btree_id))
- flags |= BTREE_ITER_with_key_cache;
-
- bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
- bch2_btree_iter_flags(trans, btree_id, depth, flags),
- _RET_IP_);
-
- iter->min_depth = depth;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
- BUG_ON(path->level != depth);
- BUG_ON(iter->min_depth != depth);
-}
-
-void bch2_trans_copy_iter(struct btree_trans *trans,
- struct btree_iter *dst, struct btree_iter *src)
-{
- *dst = *src;
-#ifdef TRACK_PATH_ALLOCATED
- dst->ip_allocated = _RET_IP_;
-#endif
- if (src->path)
- __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
- if (src->update_path)
- __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
- dst->key_cache_path = 0;
-}
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-void bch2_trans_kmalloc_trace_to_text(struct printbuf *out,
- darray_trans_kmalloc_trace *trace)
-{
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 60);
-
- darray_for_each(*trace, i)
- prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes);
-}
-#endif
-
-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip)
-{
- struct bch_fs *c = trans->c;
- unsigned new_top = trans->mem_top + size;
- unsigned old_bytes = trans->mem_bytes;
- unsigned new_bytes = roundup_pow_of_two(new_top);
- int ret;
- void *new_mem;
- void *p;
-
- if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n",
- BTREE_TRANS_MEM_MAX);
-
- bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-#endif
- }
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (ret)
- return ERR_PTR(ret);
-
- struct btree_transaction_stats *s = btree_trans_stats(trans);
- if (new_bytes > s->max_mem) {
- mutex_lock(&s->lock);
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
- s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
- trans->trans_kmalloc_trace.nr);
-
- memcpy(s->trans_kmalloc_trace.data,
- trans->trans_kmalloc_trace.data,
- sizeof(s->trans_kmalloc_trace.data[0]) *
- s->trans_kmalloc_trace.nr);
-#endif
- s->max_mem = new_bytes;
- mutex_unlock(&s->lock);
- }
-
- if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) {
- EBUG_ON(trans->mem_bytes >= new_bytes);
- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
- }
-
- if (old_bytes) {
- trans->realloc_bytes_required = new_bytes;
- trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
- return ERR_PTR(btree_trans_restart_ip(trans,
- BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
- }
-
- EBUG_ON(trans->mem);
-
- new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_mem)) {
- bch2_trans_unlock(trans);
-
- new_mem = kmalloc(new_bytes, GFP_KERNEL);
- if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
- new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
- new_bytes = BTREE_TRANS_MEM_MAX;
- trans->used_mempool = true;
- }
-
- EBUG_ON(!new_mem);
-
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
-
- ret = bch2_trans_relock(trans);
- if (ret)
- return ERR_PTR(ret);
- }
-
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
-
- p = trans->mem + trans->mem_top;
- trans->mem_top += size;
- memset(p, 0, size);
- return p;
-}
-
-static inline void check_srcu_held_too_long(struct btree_trans *trans)
-{
- WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
- "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
- (jiffies - trans->srcu_lock_time) / HZ);
-}
-
-void bch2_trans_srcu_unlock(struct btree_trans *trans)
-{
- if (trans->srcu_held) {
- struct bch_fs *c = trans->c;
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->cached && !btree_node_locked(path, 0))
- path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
-
- check_srcu_held_too_long(trans);
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- trans->srcu_held = false;
- }
-}
-
-static void bch2_trans_srcu_lock(struct btree_trans *trans)
-{
- if (!trans->srcu_held) {
- trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
- trans->srcu_held = true;
- }
-}
-
-/**
- * bch2_trans_begin() - reset a transaction after an interrupted attempt
- * @trans: transaction to reset
- *
- * Returns: current restart counter, to be used with trans_was_restarted()
- *
- * While iterating over or updating nodes, an attempt to lock a btree node
- * may return BCH_ERR_transaction_restart when the trylock fails. When this
- * occurs, bch2_trans_begin() should be called and the transaction retried.
- */
-u32 bch2_trans_begin(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
- u64 now;
-
- bch2_trans_reset_updates(trans);
-
- trans->restart_count++;
- trans->mem_top = 0;
-
- if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) {
- EBUG_ON(!trans->mem || !trans->mem_bytes);
- unsigned new_bytes = trans->realloc_bytes_required;
- void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_mem)) {
- bch2_trans_unlock(trans);
- new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
-
- EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX);
-
- if (!new_mem) {
- new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
- new_bytes = BTREE_TRANS_MEM_MAX;
- trans->used_mempool = true;
- kfree(trans->mem);
- }
- }
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
- }
-
- trans_for_each_path(trans, path, i) {
- path->should_be_locked = false;
-
- /*
-		 * If the transaction wasn't restarted, we presume we're doing
-		 * something new: don't keep iterators except the ones that are
-		 * in use - except for the subvolumes btree:
- */
- if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
- path->preserve = false;
-
- /*
- * XXX: we probably shouldn't be doing this if the transaction
- * was restarted, but currently we still overflow transaction
- * iterators if we do that
- */
- if (!path->ref && !path->preserve)
- __bch2_path_free(trans, i);
- else
- path->preserve = false;
- }
-
- now = local_clock();
-
- if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
- time_after64(now, trans->last_begin_time + 10))
- __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
- trans->last_begin_time, now);
-
- if (!trans->restarted &&
- (need_resched() ||
- time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
- bch2_trans_unlock(trans);
- cond_resched();
- now = local_clock();
- }
- trans->last_begin_time = now;
-
- if (unlikely(trans->srcu_held &&
- time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
- bch2_trans_srcu_unlock(trans);
-
- trans->last_begin_ip = _RET_IP_;
-
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
- if (trans->restarted) {
- trans->restart_count_this_trans++;
- } else {
- trans->restart_count_this_trans = 0;
- }
-#endif
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- trans->trans_kmalloc_trace.nr = 0;
-#endif
-
- trans_set_locked(trans, false);
-
- if (trans->restarted) {
- bch2_btree_path_traverse_all(trans);
- trans->notrace_relock_fail = false;
- }
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- return trans->restart_count;
-}
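
A hedged sketch of the retry loop that bch2_trans_begin() is designed for; this is essentially what the lockrestart_do() macro in btree_iter.h expands to, with do_update() standing in for any transactional work that may return a restart error:

	int ret;

	do {
		u32 restart_count = bch2_trans_begin(trans);

		ret = do_update(trans);		/* placeholder helper */

		if (!ret)
			bch2_trans_verify_not_restarted(trans, restart_count);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
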
-
-const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
-
-unsigned bch2_trans_get_fn_idx(const char *fn)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
- if (!bch2_btree_transaction_fns[i] ||
- bch2_btree_transaction_fns[i] == fn) {
- bch2_btree_transaction_fns[i] = fn;
- return i;
- }
-
- pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
- return 0;
-}
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
- __acquires(&c->btree_trans_barrier)
-{
- struct btree_trans *trans;
-
- if (IS_ENABLED(__KERNEL__)) {
- trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
- if (trans) {
- memset(trans, 0, offsetof(struct btree_trans, list));
- goto got_trans;
- }
- }
-
- trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
- memset(trans, 0, sizeof(*trans));
-
- seqmutex_lock(&c->btree_trans_lock);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- struct btree_trans *pos;
- pid_t pid = current->pid;
-
- trans->locking_wait.task = current;
-
- list_for_each_entry(pos, &c->btree_trans_list, list) {
- struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
- /*
- * We'd much prefer to be stricter here and completely
- * disallow multiple btree_trans in the same thread -
- * but the data move path calls bch2_write when we
- * already have a btree_trans initialized.
- */
- BUG_ON(pos_task &&
- pid == pos_task->pid &&
- pos->locked);
- }
- }
-
- list_add(&trans->list, &c->btree_trans_list);
- seqmutex_unlock(&c->btree_trans_lock);
-got_trans:
- trans->c = c;
- trans->last_begin_time = local_clock();
- trans->fn_idx = fn_idx;
- trans->locking_wait.task = current;
- trans->journal_replay_not_finished =
- unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
- atomic_inc_not_zero(&c->journal_keys.ref);
- trans->nr_paths = ARRAY_SIZE(trans->_paths);
- trans->paths_allocated = trans->_paths_allocated;
- trans->sorted = trans->_sorted;
- trans->paths = trans->_paths;
- trans->updates = trans->_updates;
-
- *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
-
- trans->paths_allocated[0] = 1;
-
- static struct lock_class_key lockdep_key;
- lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0);
-
- if (fn_idx < BCH_TRANSACTIONS_NR) {
- trans->fn = bch2_btree_transaction_fns[fn_idx];
-
- struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
-
- if (s->max_mem) {
- unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
-
- trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
- if (likely(trans->mem))
- trans->mem_bytes = expected_mem_bytes;
- }
-
- trans->nr_paths_max = s->nr_max_paths;
- }
-
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
- trans->srcu_held = true;
- trans_set_locked(trans, false);
-
- closure_init_stack_release(&trans->ref);
- return trans;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static bool btree_paths_leaked(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->ref)
- return true;
- return false;
-}
-
-static void check_btree_paths_leaked(struct btree_trans *trans)
-{
- if (btree_paths_leaked(trans)) {
- struct bch_fs *c = trans->c;
- struct btree_path *path;
- unsigned i;
-
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn);
- trans_for_each_path(trans, path, i)
- if (path->ref)
- prt_printf(&buf, "btree %s %pS\n",
- bch2_btree_id_str(path->btree_id),
- (void *) path->ip_allocated);
-
- bch2_fs_emergency_read_only2(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-}
-#else
-static inline void check_btree_paths_leaked(struct btree_trans *trans) {}
-#endif
-
-void bch2_trans_put(struct btree_trans *trans)
- __releases(&c->btree_trans_barrier)
-{
- struct bch_fs *c = trans->c;
-
- if (trans->restarted)
- bch2_trans_in_restart_error(trans);
-
- bch2_trans_unlock(trans);
-
- trans_for_each_update(trans, i)
- __btree_path_put(trans, trans->paths + i->path, true);
- trans->nr_updates = 0;
-
- check_btree_paths_leaked(trans);
-
- if (trans->srcu_held) {
- check_srcu_held_too_long(trans);
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- }
-
- if (unlikely(trans->journal_replay_not_finished))
- bch2_journal_keys_put(c);
-
- /*
-	 * trans->ref protects trans->locking_wait.task and the btree_paths
-	 * array; they're used by the cycle detector
- */
- closure_return_sync(&trans->ref);
- trans->locking_wait.task = NULL;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- darray_exit(&trans->last_restarted_trace);
-#endif
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_exit(&trans->trans_kmalloc_trace);
-#endif
-
- unsigned long *paths_allocated = trans->paths_allocated;
- trans->paths_allocated = NULL;
- trans->paths = NULL;
-
- if (paths_allocated != trans->_paths_allocated)
- kvfree_rcu_mightsleep(paths_allocated);
-
- if (trans->used_mempool)
- mempool_free(trans->mem, &c->btree_trans_mem_pool);
- else
- kfree(trans->mem);
-
- /* Userspace doesn't have a real percpu implementation: */
- if (IS_ENABLED(__KERNEL__))
- trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
-
- if (trans) {
- seqmutex_lock(&c->btree_trans_lock);
- list_del(&trans->list);
- seqmutex_unlock(&c->btree_trans_lock);
-
- mempool_free(trans, &c->btree_trans_pool);
- }
-}
-
-bool bch2_current_has_btree_trans(struct bch_fs *c)
-{
- seqmutex_lock(&c->btree_trans_lock);
- struct btree_trans *trans;
- bool ret = false;
- list_for_each_entry(trans, &c->btree_trans_list, list)
- if (trans->locking_wait.task == current &&
- trans->locked) {
- ret = true;
- break;
- }
- seqmutex_unlock(&c->btree_trans_lock);
- return ret;
-}
-
-static void __maybe_unused
-bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
- struct btree_bkey_cached_common *b)
-{
- struct six_lock_count c = six_lock_counts(&b->lock);
- pid_t pid;
-
- scoped_guard(rcu) {
- struct task_struct *owner = READ_ONCE(b->lock.owner);
- pid = owner ? owner->pid : 0;
- }
-
- prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
- bch2_btree_id_to_text(out, b->btree_id);
- prt_printf(out, " l=%u:", b->level);
- bch2_bpos_to_text(out, btree_node_pos(b));
-
- prt_printf(out, "\t locks %u:%u:%u held by pid %u",
- c.n[0], c.n[1], c.n[2], pid);
-}
-
-void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
-{
- struct btree_bkey_cached_common *b;
- static char lock_types[] = { 'r', 'i', 'w' };
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
- unsigned l, idx;
-
- /* before rcu_read_lock(): */
- bch2_printbuf_make_room(out, 4096);
-
- if (!out->nr_tabstops) {
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 32);
- }
-
- prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
-
- /* trans->paths is rcu protected vs. freeing */
- guard(rcu)();
- out->atomic++;
-
- struct btree_path *paths = rcu_dereference(trans->paths);
- if (!paths)
- goto out;
-
- unsigned long *paths_allocated = trans_paths_allocated(paths);
-
- trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
- struct btree_path *path = paths + idx;
- if (!path->nodes_locked)
- continue;
-
- prt_printf(out, " path %u %c ",
- idx,
- path->cached ? 'c' : 'b');
- bch2_btree_id_to_text(out, path->btree_id);
- prt_printf(out, " l=%u:", path->level);
- bch2_bpos_to_text(out, path->pos);
- prt_newline(out);
-
- for (l = 0; l < BTREE_MAX_DEPTH; l++) {
- if (btree_node_locked(path, l) &&
- !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
- prt_printf(out, " %c l=%u ",
- lock_types[btree_node_locked_type(path, l)], l);
- bch2_btree_bkey_cached_common_to_text(out, b);
- prt_newline(out);
- }
- }
- }
-
- b = READ_ONCE(trans->locking);
- if (b) {
- prt_printf(out, " blocked for %lluus on\n",
- div_u64(local_clock() - trans->locking_wait.start_time, 1000));
- prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
- bch2_btree_bkey_cached_common_to_text(out, b);
- prt_newline(out);
- }
-out:
- --out->atomic;
-}
-
-void bch2_fs_btree_iter_exit(struct bch_fs *c)
-{
- struct btree_transaction_stats *s;
- struct btree_trans *trans;
- int cpu;
-
- if (c->btree_trans_bufs)
- for_each_possible_cpu(cpu) {
- struct btree_trans *trans =
- per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
-
- if (trans) {
- seqmutex_lock(&c->btree_trans_lock);
- list_del(&trans->list);
- seqmutex_unlock(&c->btree_trans_lock);
- }
- kfree(trans);
- }
- free_percpu(c->btree_trans_bufs);
-
- trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
- if (trans)
- panic("%s leaked btree_trans\n", trans->fn);
-
- for (s = c->btree_transaction_stats;
- s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
- s++) {
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_exit(&s->trans_kmalloc_trace);
-#endif
- kfree(s->max_paths_text);
- bch2_time_stats_exit(&s->lock_hold_times);
- }
-
- if (c->btree_trans_barrier_initialized) {
- synchronize_srcu_expedited(&c->btree_trans_barrier);
- cleanup_srcu_struct(&c->btree_trans_barrier);
- }
- mempool_exit(&c->btree_trans_mem_pool);
- mempool_exit(&c->btree_trans_pool);
-}
-
-void bch2_fs_btree_iter_init_early(struct bch_fs *c)
-{
- struct btree_transaction_stats *s;
-
- for (s = c->btree_transaction_stats;
- s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
- s++) {
- bch2_time_stats_init(&s->duration);
- bch2_time_stats_init(&s->lock_hold_times);
- mutex_init(&s->lock);
- }
-
- INIT_LIST_HEAD(&c->btree_trans_list);
- seqmutex_init(&c->btree_trans_lock);
-}
-
-int bch2_fs_btree_iter_init(struct bch_fs *c)
-{
- int ret;
-
- c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
- if (!c->btree_trans_bufs)
- return -ENOMEM;
-
- ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
- sizeof(struct btree_trans)) ?:
- mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
- BTREE_TRANS_MEM_MAX) ?:
- init_srcu_struct(&c->btree_trans_barrier);
- if (ret)
- return ret;
-
- /*
- * static annotation (hackily done) for lock ordering of reclaim vs.
- * btree node locks:
- */
-#ifdef CONFIG_LOCKDEP
- fs_reclaim_acquire(GFP_KERNEL);
- struct btree_trans *trans = bch2_trans_get(c);
- trans_set_locked(trans, false);
- bch2_trans_put(trans);
- fs_reclaim_release(GFP_KERNEL);
-#endif
-
- c->btree_trans_barrier_initialized = true;
- return 0;
-}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
deleted file mode 100644
index 09dd3e52622e..000000000000
--- a/fs/bcachefs/btree_iter.h
+++ /dev/null
@@ -1,1010 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_ITER_H
-#define _BCACHEFS_BTREE_ITER_H
-
-#include "bset.h"
-#include "btree_types.h"
-#include "trace.h"
-
-void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
-void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_paths_updates(struct btree_trans *);
-
-static inline int __bkey_err(const struct bkey *k)
-{
- return PTR_ERR_OR_ZERO(k);
-}
-
-#define bkey_err(_k) __bkey_err((_k).k)
-
-static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
-{
- unsigned idx = path - trans->paths;
-
- EBUG_ON(idx >= trans->nr_paths);
- EBUG_ON(!test_bit(idx, trans->paths_allocated));
- if (unlikely(path->ref == U8_MAX)) {
- bch2_dump_trans_paths_updates(trans);
- panic("path %u refcount overflow\n", idx);
- }
-
- path->ref++;
- path->intent_ref += intent;
- trace_btree_path_get_ll(trans, path);
-}
-
-static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
-{
- EBUG_ON(path - trans->paths >= trans->nr_paths);
- EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
- EBUG_ON(!path->ref);
- EBUG_ON(!path->intent_ref && intent);
-
- trace_btree_path_put_ll(trans, path);
- path->intent_ref -= intent;
- return --path->ref == 0;
-}
-
-static inline void btree_path_set_dirty(struct btree_trans *trans,
- struct btree_path *path,
- enum btree_path_uptodate u)
-{
- BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
- path->uptodate = max_t(unsigned, path->uptodate, u);
-}
-
-static inline struct btree *btree_path_node(struct btree_path *path,
- unsigned level)
-{
- return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
-}
-
-static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
- const struct btree *b, unsigned level)
-{
- return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
-}
-
-static inline struct btree *btree_node_parent(struct btree_path *path,
- struct btree *b)
-{
- return btree_path_node(path, b->c.level + 1);
-}
-
-/* Iterate over paths within a transaction: */
-
-void __bch2_btree_trans_sort_paths(struct btree_trans *);
-
-static inline void btree_trans_sort_paths(struct btree_trans *trans)
-{
- if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- trans->paths_sorted)
- return;
- __bch2_btree_trans_sort_paths(trans);
-}
-
-static inline unsigned long *trans_paths_nr(struct btree_path *paths)
-{
- return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths;
-}
-
-static inline unsigned long *trans_paths_allocated(struct btree_path *paths)
-{
- unsigned long *v = trans_paths_nr(paths);
- return v - BITS_TO_LONGS(*v);
-}
-
-#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\
- for (_idx = _start; \
- (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \
- _idx++)
-
-static inline struct btree_path *
-__trans_next_path(struct btree_trans *trans, unsigned *idx)
-{
- unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG;
- /*
- * Open coded find_next_bit(), because
-	 * - this is a fast path, and we can't afford the function call
-	 * - we know that nr_paths is a multiple of BITS_PER_LONG
- */
- while (*idx < trans->nr_paths) {
- unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1));
- if (v) {
- *idx += __ffs(v);
- return trans->paths + *idx;
- }
-
- *idx += BITS_PER_LONG;
- *idx &= ~(BITS_PER_LONG - 1);
- w++;
- }
-
- return NULL;
-}
-
-/*
- * This version is intended to be safe for use on a btree_trans that is owned by
- * another thread, for bch2_btree_trans_to_text();
- */
-#define trans_for_each_path_from(_trans, _path, _idx, _start) \
- for (_idx = _start; \
- (_path = __trans_next_path((_trans), &_idx)); \
- _idx++)
-
-#define trans_for_each_path(_trans, _path, _idx) \
- trans_for_each_path_from(_trans, _path, _idx, 1)
-
-static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned idx = path ? path->sorted_idx + 1 : 0;
-
- EBUG_ON(idx > trans->nr_sorted);
-
- return idx < trans->nr_sorted
- ? trans->paths + trans->sorted[idx]
- : NULL;
-}
-
-static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
-
- return idx
- ? trans->paths + trans->sorted[idx - 1]
- : NULL;
-}
-
-#define trans_for_each_path_idx_inorder(_trans, _iter) \
- for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
- (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
- _iter.sorted_idx < (_trans)->nr_sorted); \
- _iter.sorted_idx++)
-
-struct trans_for_each_path_inorder_iter {
- btree_path_idx_t sorted_idx;
- btree_path_idx_t path_idx;
-};
-
-#define trans_for_each_path_inorder(_trans, _path, _iter) \
- for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
- (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
- _path = (_trans)->paths + _iter.path_idx, \
- _iter.sorted_idx < (_trans)->nr_sorted); \
- _iter.sorted_idx++)
-
-#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \
- for (_i = trans->nr_sorted - 1; \
- ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\
- --_i)
-
-static inline bool __path_has_node(const struct btree_path *path,
- const struct btree *b)
-{
- return path->l[b->c.level].b == b &&
- btree_node_lock_seq_matches(path, b, b->c.level);
-}
-
-static inline struct btree_path *
-__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
- unsigned *idx)
-{
- struct btree_path *path;
-
- while ((path = __trans_next_path(trans, idx)) &&
- !__path_has_node(path, b))
- (*idx)++;
-
- return path;
-}
-
-#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \
- for (_iter = 1; \
- (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\
- _iter++)
-
-btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t,
- bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_make_mut(struct btree_trans *trans,
- btree_path_idx_t path, bool intent,
- unsigned long ip)
-{
- if (trans->paths[path].ref > 1 ||
- trans->paths[path].preserve)
- path = __bch2_btree_path_make_mut(trans, path, intent, ip);
- trans->paths[path].should_be_locked = false;
- return path;
-}
-
-btree_path_idx_t __must_check
-__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t,
- struct bpos, bool, unsigned long);
-
-static inline btree_path_idx_t __must_check
-bch2_btree_path_set_pos(struct btree_trans *trans,
- btree_path_idx_t path, struct bpos new_pos,
- bool intent, unsigned long ip)
-{
- return !bpos_eq(new_pos, trans->paths[path].pos)
- ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip)
- : path;
-}
-
-int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
- btree_path_idx_t,
- unsigned, unsigned long);
-
-static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *);
-
-static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
- btree_path_idx_t path, unsigned flags)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
- return 0;
-
- return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
-}
-
-btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
- unsigned, unsigned, unsigned, unsigned long);
-btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id,
- unsigned, struct bpos);
-
-struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
-
-/*
- * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
- * different snapshot:
- */
-static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
-{
- struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
-
- if (k.k && bpos_eq(path->pos, k.k->p))
- return k;
-
- bkey_init(u);
- u->p = path->pos;
- return (struct bkey_s_c) { u, NULL };
-}
-
-struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
- struct btree_iter *, struct bpos);
-
-void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
-
-int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
-
-static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
-{
- return mutex_trylock(lock)
- ? 0
- : __bch2_trans_mutex_lock(trans, lock);
-}
-
-/* Debug: */
-
-void __bch2_trans_verify_paths(struct btree_trans *);
-void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
-
-static inline void bch2_trans_verify_paths(struct btree_trans *trans)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_trans_verify_paths(trans);
-}
-
-static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree,
- struct bpos pos)
-{
- if (static_branch_unlikely(&bch2_debug_check_iterators))
- __bch2_assert_pos_locked(trans, btree, pos);
-}
-
-void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
- struct btree *, struct bkey_packed *);
-void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
- struct btree *, struct btree_node_iter *,
- struct bkey_packed *, unsigned, unsigned);
-
-int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
-
-void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool);
-
-int bch2_trans_relock(struct btree_trans *);
-int bch2_trans_relock_notrace(struct btree_trans *);
-void bch2_trans_unlock(struct btree_trans *);
-void bch2_trans_unlock_long(struct btree_trans *);
-
-static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
-{
- return restart_count != trans->restart_count
- ? -BCH_ERR_transaction_restart_nested
- : 0;
-}
-
-void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
-
-static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
- u32 restart_count)
-{
- if (trans_was_restarted(trans, restart_count))
- bch2_trans_restart_error(trans, restart_count);
-}
-
-void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *);
-
-static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans)
-{
- if (trans->restarted || !trans->locked)
- bch2_trans_unlocked_or_in_restart_error(trans);
-}
-
-__always_inline
-static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip)
-{
- BUG_ON(err <= 0);
- BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
-
- trans->restarted = err;
- trans->last_restarted_ip = ip;
- return -err;
-}
-
-__always_inline
-static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
-{
- btree_trans_restart_foreign_task(trans, err, ip);
-#ifdef CONFIG_BCACHEFS_DEBUG
- darray_exit(&trans->last_restarted_trace);
- bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
-#endif
- return -err;
-}
-
-__always_inline
-static int btree_trans_restart(struct btree_trans *trans, int err)
-{
- return btree_trans_restart_ip(trans, err, _THIS_IP_);
-}
-
-static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
-{
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
- if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) {
- trace_and_count(trans->c, trans_restart_injected, trans, ip);
- return btree_trans_restart_ip(trans,
- BCH_ERR_transaction_restart_fault_inject, ip);
- }
-#endif
- return 0;
-}
-
-bool bch2_btree_node_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
-
-void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
-
-static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
- struct btree_path *path)
-{
- unsigned new_locks_want = path->level + !!path->intent_ref;
-
- if (path->locks_want > new_locks_want)
- __bch2_btree_path_downgrade(trans, path, new_locks_want);
-}
-
-void bch2_trans_downgrade(struct btree_trans *);
-
-void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
-void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
-void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
-
-int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-
-struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos);
-struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *);
-
-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX);
-}
-
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos);
-
-static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter)
-{
- return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN);
-}
-
-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *);
-
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *);
-
-bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *);
-bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *);
-
-static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
-{
- iter->k.type = KEY_TYPE_deleted;
- iter->k.p.inode = iter->pos.inode = new_pos.inode;
- iter->k.p.offset = iter->pos.offset = new_pos.offset;
- iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot;
- iter->k.size = 0;
-}
-
-static inline void bch2_btree_iter_set_pos(struct btree_trans *trans,
- struct btree_iter *iter, struct bpos new_pos)
-{
- if (unlikely(iter->update_path))
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
-
- if (!(iter->flags & BTREE_ITER_all_snapshots))
- new_pos.snapshot = iter->snapshot;
-
- __bch2_btree_iter_set_pos(iter, new_pos);
-}
-
-static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
-{
- BUG_ON(!(iter->flags & BTREE_ITER_is_extents));
- iter->pos = bkey_start_pos(&iter->k);
-}
-
-static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans,
- struct btree_iter *iter, u32 snapshot)
-{
- struct bpos pos = iter->pos;
-
- iter->snapshot = snapshot;
- pos.snapshot = snapshot;
- bch2_btree_iter_set_pos(trans, iter, pos);
-}
-
-void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
-
-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
- unsigned btree_id,
- unsigned level,
- unsigned flags)
-{
- if (level || !btree_id_cached(trans->c, btree_id)) {
- flags &= ~BTREE_ITER_cached;
- flags &= ~BTREE_ITER_with_key_cache;
- } else if (!(flags & BTREE_ITER_cached))
- flags |= BTREE_ITER_with_key_cache;
-
- if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
- btree_id_is_extents(btree_id))
- flags |= BTREE_ITER_is_extents;
-
- if (!(flags & BTREE_ITER_snapshot_field) &&
- !btree_type_has_snapshot_field(btree_id))
- flags &= ~BTREE_ITER_all_snapshots;
-
- if (!(flags & BTREE_ITER_all_snapshots) &&
- btree_type_has_snapshots(btree_id))
- flags |= BTREE_ITER_filter_snapshots;
-
- if (trans->journal_replay_not_finished)
- flags |= BTREE_ITER_with_journal;
-
- return flags;
-}
-
-static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags,
- unsigned long ip)
-{
- iter->update_path = 0;
- iter->key_cache_path = 0;
- iter->btree_id = btree_id;
- iter->min_depth = 0;
- iter->flags = flags;
- iter->snapshot = pos.snapshot;
- iter->pos = pos;
- iter->k = POS_KEY(pos);
- iter->journal_idx = 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
- iter->ip_allocated = ip;
-#endif
- iter->path = bch2_path_get(trans, btree_id, iter->pos,
- locks_want, depth, flags, ip);
-}
-
-void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
- enum btree_id, struct bpos, unsigned);
-
-static inline void bch2_trans_iter_init(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned flags)
-{
- if (__builtin_constant_p(btree_id) &&
- __builtin_constant_p(flags))
- bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, 0, flags),
- _THIS_IP_);
- else
- bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
-}
-
-void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
- enum btree_id, struct bpos,
- unsigned, unsigned, unsigned);
-void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *);
-
-void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *);
-
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
-void bch2_trans_kmalloc_trace_to_text(struct printbuf *,
- darray_trans_kmalloc_trace *);
-#endif
-
-void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long);
-
-static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size,
- unsigned long ip)
-{
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_push(&trans->trans_kmalloc_trace,
- ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size }));
-#endif
-}
-
-static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size,
- unsigned long ip)
-{
- size = roundup(size, 8);
-
- bch2_trans_kmalloc_trace(trans, size, ip);
-
- if (likely(trans->mem_top + size <= trans->mem_bytes)) {
- void *p = trans->mem + trans->mem_top;
-
- trans->mem_top += size;
- return p;
- } else {
- return __bch2_trans_kmalloc(trans, size, ip);
- }
-}
-
-static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size,
- unsigned long ip)
-{
- size = roundup(size, 8);
-
- bch2_trans_kmalloc_trace(trans, size, ip);
-
- if (likely(trans->mem_top + size <= trans->mem_bytes)) {
- void *p = trans->mem + trans->mem_top;
-
- trans->mem_top += size;
- memset(p, 0, size);
- return p;
- } else {
- return __bch2_trans_kmalloc(trans, size, ip);
- }
-}
-
-/**
- * bch2_trans_kmalloc - allocate memory for use by the current transaction
- *
- * Must be called after bch2_trans_begin, which on second and further calls
- * frees all memory allocated in this transaction
- */
-static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
-{
- return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_);
-}
-
-static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
-{
- return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_);
-}
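
A sketch of typical usage, assuming k is a struct bkey_s_c obtained from an iterator earlier in the same transaction attempt; since bch2_trans_begin() resets the bump allocator, the allocation has to be redone after a restart:

	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	bkey_reassemble(n, k);	/* copy key and value into the new allocation */
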
-
-static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned type)
-{
- struct bkey_s_c k;
-
- bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
- k = bch2_btree_iter_peek_slot(trans, iter);
-
- if (!bkey_err(k) && type && k.k->type != type)
- k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
- if (unlikely(bkey_err(k)))
- bch2_trans_iter_exit(trans, iter);
- return k;
-}
-
-static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned flags)
-{
- return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0);
-}
-
-#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
- bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
- _btree_id, _pos, _flags, KEY_TYPE_##_type))
-
-static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k)
-{
- unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k));
- memcpy(dst_v, src_k.v, b);
- if (unlikely(b < dst_size))
- memset(dst_v + b, 0, dst_size - b);
-}
-
-#define bkey_val_copy(_dst_v, _src_k) \
-do { \
- BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \
- __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \
-} while (0)
-
-static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
- unsigned btree_id, struct bpos pos,
- unsigned flags, unsigned type,
- unsigned val_size, void *val)
-{
- struct btree_iter iter;
- struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
- int ret = bkey_err(k);
- if (!ret) {
- __bkey_val_copy(val, val_size, k);
- bch2_trans_iter_exit(trans, &iter);
- }
-
- return ret;
-}
-
-#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\
- __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \
- KEY_TYPE_##_type, sizeof(*_val), _val)
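
A sketch of a typed lookup, using the inodes btree and KEY_TYPE_inode_v3 purely as examples; a key of the wrong type surfaces as -BCH_ERR_ENOENT_bkey_type_mismatch through bkey_err(), and on any error the iterator has already been exited:

	struct btree_iter iter;
	struct bkey_s_c_inode_v3 inode =
		bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_inodes, pos,
					 0, inode_v3);
	int ret = bkey_err(inode);
	if (ret)
		return ret;

	/* ... use inode.v ... */
	bch2_trans_iter_exit(trans, &iter);
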
-
-void bch2_trans_srcu_unlock(struct btree_trans *);
-
-u32 bch2_trans_begin(struct btree_trans *);
-
-#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _locks_want, _depth, _flags, _b, _do) \
-({ \
- bch2_trans_begin((_trans)); \
- \
- struct btree_iter _iter; \
- bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \
- _start, _locks_want, _depth, _flags); \
- int _ret3 = 0; \
- do { \
- _ret3 = lockrestart_do((_trans), ({ \
- struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\
- if (!_b) \
- break; \
- \
- PTR_ERR_OR_ZERO(_b) ?: (_do); \
- })) ?: \
- lockrestart_do((_trans), \
- PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\
- } while (!_ret3); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _flags, _b, _do) \
- __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- 0, 0, _flags, _b, _do)
-
-static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned flags)
-{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
- bch2_btree_iter_peek_prev(trans, iter);
-}
-
-static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned flags)
-{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
- bch2_btree_iter_peek(trans, iter);
-}
-
-static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end,
- unsigned flags)
-{
- if (!(flags & BTREE_ITER_slots))
- return bch2_btree_iter_peek_max(trans, iter, end);
-
- if (bkey_gt(iter->pos, end))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_slot(trans, iter);
-}
-
-int __bch2_btree_trans_too_many_iters(struct btree_trans *);
-
-static inline int btree_trans_too_many_iters(struct btree_trans *trans)
-{
- if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
- return __bch2_btree_trans_too_many_iters(trans);
-
- return 0;
-}
-
-/*
- * goto instead of loop, so that when used inside for_each_btree_key2()
- * break/continue work correctly
- */
-#define lockrestart_do(_trans, _do) \
-({ \
- __label__ transaction_restart; \
- u32 _restart_count; \
- int _ret2; \
-transaction_restart: \
- _restart_count = bch2_trans_begin(_trans); \
- _ret2 = (_do); \
- \
- if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \
- goto transaction_restart; \
- \
- if (!_ret2) \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- _ret2; \
-})
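
A sketch of typical usage, with update_key() standing in for any helper that may return a transaction restart error:

	int ret = lockrestart_do(trans,
			update_key(trans, pos) ?:	/* placeholder helper */
			bch2_trans_commit(trans, NULL, NULL, 0));
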
-
-/*
- * nested_lockrestart_do(), nested_commit_do():
- *
- * These are like lockrestart_do() and commit_do(), with two differences:
- *
- * - We don't call bch2_trans_begin() unless we had a transaction restart
- * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
- * transaction restart
- */
-#define nested_lockrestart_do(_trans, _do) \
-({ \
- u32 _restart_count, _orig_restart_count; \
- int _ret2; \
- \
- _restart_count = _orig_restart_count = (_trans)->restart_count; \
- \
- while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
- _restart_count = bch2_trans_begin(_trans); \
- \
- if (!_ret2) \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- \
- _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \
-})
-
-#define for_each_btree_key_max_continue(_trans, _iter, \
- _end, _flags, _k, _do) \
-({ \
- struct bkey_s_c _k; \
- int _ret3 = 0; \
- \
- do { \
- _ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \
- _end, (_flags)); \
- if (!(_k).k) \
- break; \
- \
- bkey_err(_k) ?: (_do); \
- })); \
- } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \
- for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
-
-#define for_each_btree_key_max(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _do) \
-({ \
- bch2_trans_begin(trans); \
- \
- struct btree_iter _iter; \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\
-})
-
-#define for_each_btree_key(_trans, _iter, _btree_id, \
- _start, _flags, _k, _do) \
- for_each_btree_key_max(_trans, _iter, _btree_id, _start, \
- SPOS_MAX, _flags, _k, _do)
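
A sketch of a whole-btree walk, using the extents btree purely as an example; the macro declares iter and k itself, the trailing 0 in the statement expression is the per-key return value, and bch2_trans_run() is defined further down in this header:

	u64 nr = 0;
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents, POS_MIN,
				   BTREE_ITER_prefetch, k, ({
			nr++;
			0;
		})));
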
-
-#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
- _start, _flags, _k, _do) \
-({ \
- struct btree_iter _iter; \
- struct bkey_s_c _k; \
- int _ret3 = 0; \
- \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- do { \
- _ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \
- (_flags)); \
- if (!(_k).k) \
- break; \
- \
- bkey_err(_k) ?: (_do); \
- })); \
- } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_key_commit(_trans, _iter, _btree_id, \
- _start, _iter_flags, _k, \
- _disk_res, _journal_seq, _commit_flags,\
- _do) \
- for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
- (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_commit_flags)))
-
-#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \
- _start, _iter_flags, _k, \
- _disk_res, _journal_seq, _commit_flags,\
- _do) \
- for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
- (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_commit_flags)))
-
-#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \
- _start, _end, _iter_flags, _k, \
- _disk_res, _journal_seq, _commit_flags,\
- _do) \
- for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
- (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_commit_flags)))
-
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *,
- struct btree_iter *);
-
-#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(_trans, &(_iter)))
-
-#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\
- for (; \
- (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(_trans, &(_iter)))
-
-#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
- SPOS_MAX, _flags, _k, _ret)
-
-#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_rewind(_trans, &(_iter)))
-
-#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \
- for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret)
-
-/*
- * This should not be used in a fastpath without first trying _do in
- * nonblocking mode - it will cause excessive transaction restarts and
- * potentially livelock:
- */
-#define drop_locks_do(_trans, _do) \
-({ \
- bch2_trans_unlock(_trans); \
- (_do) ?: bch2_trans_relock(_trans); \
-})
-
-#define allocate_dropping_locks_errcode(_trans, _do) \
-({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
- int _ret = _do; \
- \
- if (bch2_err_matches(_ret, ENOMEM)) { \
- _gfp = GFP_KERNEL; \
- _ret = drop_locks_do(_trans, _do); \
- } \
- _ret; \
-})
-
-#define allocate_dropping_locks(_trans, _ret, _do) \
-({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
- typeof(_do) _p = _do; \
- \
- _ret = 0; \
- if (unlikely(!_p)) { \
- _gfp = GFP_KERNEL; \
- _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
- } \
- _p; \
-})
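
A sketch of typical usage; the expression passed as _do is expected to use the macro's _gfp variable, and struct foo is only a stand-in type:

	int ret = 0;
	struct foo *f = allocate_dropping_locks(trans, ret,
				kzalloc(sizeof(*f), _gfp));
	if (!f && !ret)
		ret = -ENOMEM;
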
-
-struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
-void bch2_trans_put(struct btree_trans *);
-
-bool bch2_current_has_btree_trans(struct bch_fs *);
-
-extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
-unsigned bch2_trans_get_fn_idx(const char *);
-
-#define bch2_trans_get(_c) \
-({ \
- static unsigned trans_fn_idx; \
- \
- if (unlikely(!trans_fn_idx)) \
- trans_fn_idx = bch2_trans_get_fn_idx(__func__); \
- __bch2_trans_get(_c, trans_fn_idx); \
-})
-
-/*
- * We don't use DEFINE_CLASS() because using a function for the constructor
- * breaks bch2_trans_get()'s use of __func__
- */
-typedef struct btree_trans * class_btree_trans_t;
-static inline void class_btree_trans_destructor(struct btree_trans **p)
-{
- struct btree_trans *trans = *p;
- bch2_trans_put(trans);
-}
-
-#define class_btree_trans_constructor(_c) bch2_trans_get(_c)
-
-#define bch2_trans_run(_c, _do) \
-({ \
- CLASS(btree_trans, trans)(_c); \
- (_do); \
-})
-
-#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
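
A sketch of the most compact form, with delete_one_key() standing in for any function that takes a btree_trans; bch2_trans_do() allocates the transaction, retries on restart, and puts it when the statement expression ends:

	int ret = bch2_trans_do(c, delete_one_key(trans, pos));
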
-
-void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
-
-void bch2_fs_btree_iter_exit(struct bch_fs *);
-void bch2_fs_btree_iter_init_early(struct bch_fs *);
-int bch2_fs_btree_iter_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_ITER_H */
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
deleted file mode 100644
index ea839560a136..000000000000
--- a/fs/bcachefs/btree_journal_iter.c
+++ /dev/null
@@ -1,830 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_cache.h"
-#include "btree_journal_iter.h"
-#include "journal_io.h"
-
-#include <linux/sort.h>
-
-/*
- * For managing keys we read from the journal: until journal replay has
- * finished, normal btree lookups need to be able to find and return keys from
- * the journal where they overwrite what's in the btree, so we have a special
- * iterator and operations for the regular btree iter code to use:
- */
-
-static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
-{
- size_t gap_size = keys->size - keys->nr;
-
- BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
-
- if (pos >= keys->gap)
- pos -= gap_size;
- return pos;
-}
-
-static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
-{
- size_t gap_size = keys->size - keys->nr;
-
- if (idx >= keys->gap)
- idx += gap_size;
- return idx;
-}
-
-static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
-{
- return keys->data + idx_to_pos(keys, idx);
-}
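
A worked example of the gap buffer mapping, assuming size == 8, nr == 6 and gap == 2, so that array slots 2 and 3 form the unused gap:

	idx:  0 1 2 3 4 5
	pos:  0 1 4 5 6 7

idx_to_pos(keys, 2) == 4 and pos_to_idx(keys, 4) == 2; positions 2 and 3 themselves never name a key and trip the BUG_ON() in pos_to_idx().
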
-
-static size_t __bch2_journal_key_search(struct journal_keys *keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- size_t l = 0, r = keys->nr, m;
-
- while (l < r) {
- m = l + ((r - l) >> 1);
- if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
- l = m + 1;
- else
- r = m;
- }
-
- BUG_ON(l < keys->nr &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
-
- BUG_ON(l &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
-
- return l;
-}
-
-static size_t bch2_journal_key_search(struct journal_keys *keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
-}
-
-/* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
-{
- struct journal_keys *keys = &c->journal_keys;
- unsigned iters = 0;
- struct journal_key *k;
-
- BUG_ON(*idx > keys->nr);
-search:
- if (!*idx)
- *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
- while (*idx &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
- --(*idx);
- iters++;
- if (iters == 10) {
- *idx = 0;
- goto search;
- }
- }
-
- struct bkey_i *ret = NULL;
- rcu_read_lock(); /* for overwritten_ranges */
-
- while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
- if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
- break;
-
- if (k->overwritten) {
- if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->end;
- else
- *idx += 1;
- continue;
- }
-
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
- ret = k->k;
- break;
- }
-
- (*idx)++;
- iters++;
- if (iters == 10) {
- *idx = 0;
- rcu_read_unlock();
- goto search;
- }
- }
-
- rcu_read_unlock();
- return ret;
-}
-
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
-{
- struct journal_keys *keys = &c->journal_keys;
- unsigned iters = 0;
- struct journal_key *k;
-
- BUG_ON(*idx > keys->nr);
-
- if (!keys->nr)
- return NULL;
-search:
- if (!*idx)
- *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
- while (*idx < keys->nr &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
- (*idx)++;
- iters++;
- if (iters == 10) {
- *idx = 0;
- goto search;
- }
- }
-
- if (*idx == keys->nr)
- --(*idx);
-
- struct bkey_i *ret = NULL;
- rcu_read_lock(); /* for overwritten_ranges */
-
- while (true) {
- k = idx_to_key(keys, *idx);
- if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
- break;
-
- if (k->overwritten) {
- if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->start;
- if (!*idx)
- break;
- --(*idx);
- continue;
- }
-
- if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
- ret = k->k;
- break;
- }
-
- if (!*idx)
- break;
- --(*idx);
- iters++;
- if (iters == 10) {
- *idx = 0;
- goto search;
- }
- }
-
- rcu_read_unlock();
- return ret;
-}
-
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos)
-{
- size_t idx = 0;
-
- return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx);
-}
-
-static void journal_iter_verify(struct journal_iter *iter)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct journal_keys *keys = iter->keys;
- size_t gap_size = keys->size - keys->nr;
-
- BUG_ON(iter->idx >= keys->gap &&
- iter->idx < keys->gap + gap_size);
-
- if (iter->idx < keys->size) {
- struct journal_key *k = keys->data + iter->idx;
-
- int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
- BUG_ON(cmp > 0);
- }
-#endif
-}
-
-static void journal_iters_fix(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
- /* The key we just inserted is immediately before the gap: */
- size_t gap_end = keys->gap + (keys->size - keys->nr);
- struct journal_key *new_key = &keys->data[keys->gap - 1];
- struct journal_iter *iter;
-
- /*
- * If an iterator points one after the key we just inserted, decrement
- * the iterator so it points at the key we just inserted - if the
- * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
- * handle that:
- */
- list_for_each_entry(iter, &c->journal_iters, list) {
- journal_iter_verify(iter);
- if (iter->idx == gap_end &&
- new_key->btree_id == iter->btree_id &&
- new_key->level == iter->level)
- iter->idx = keys->gap - 1;
- journal_iter_verify(iter);
- }
-}
-
-static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
-{
- struct journal_keys *keys = &c->journal_keys;
- struct journal_iter *iter;
- size_t gap_size = keys->size - keys->nr;
-
- list_for_each_entry(iter, &c->journal_iters, list) {
- if (iter->idx > old_gap)
- iter->idx -= gap_size;
- if (iter->idx >= new_gap)
- iter->idx += gap_size;
- }
-}
-
-int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
-{
- struct journal_key n = {
- .btree_id = id,
- .level = level,
- .k = k,
- .allocated = true,
- /*
- * Ensure these keys are done last by journal replay, to unblock
- * journal reclaim:
- */
- .journal_seq = U64_MAX,
- };
- struct journal_keys *keys = &c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
-
- BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
- if (idx < keys->size &&
- journal_key_cmp(&n, &keys->data[idx]) == 0) {
- if (keys->data[idx].allocated)
- kfree(keys->data[idx].k);
- keys->data[idx] = n;
- return 0;
- }
-
- if (idx > keys->gap)
- idx -= keys->size - keys->nr;
-
- size_t old_gap = keys->gap;
-
- if (keys->nr == keys->size) {
- journal_iters_move_gap(c, old_gap, keys->size);
- old_gap = keys->size;
-
- struct journal_keys new_keys = {
- .nr = keys->nr,
- .size = max_t(size_t, keys->size, 8) * 2,
- };
-
- new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL);
- if (!new_keys.data) {
- bch_err(c, "%s: error allocating new key array (size %zu)",
- __func__, new_keys.size);
- return bch_err_throw(c, ENOMEM_journal_key_insert);
- }
-
- /* Since @keys was full, there was no gap: */
- memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
- kvfree(keys->data);
- keys->data = new_keys.data;
- keys->nr = new_keys.nr;
- keys->size = new_keys.size;
-
- /* And now the gap is at the end: */
- keys->gap = keys->nr;
- }
-
- journal_iters_move_gap(c, old_gap, idx);
-
- move_gap(keys, idx);
-
- keys->nr++;
- keys->data[keys->gap++] = n;
-
- journal_iters_fix(c);
-
- return 0;
-}
-
-/*
- * Can only be used from the recovery thread while we're still RO - can't be
- * used once we've got RW, as journal_keys is at that point used by multiple
- * threads:
- */
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
-{
- struct bkey_i *n;
- int ret;
-
- n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
- if (!n)
- return bch_err_throw(c, ENOMEM_journal_key_insert);
-
- bkey_copy(n, k);
- ret = bch2_journal_key_insert_take(c, id, level, n);
- if (ret)
- kfree(n);
- return ret;
-}
-
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bpos pos)
-{
- struct bkey_i whiteout;
-
- bkey_init(&whiteout.k);
- whiteout.k.p = pos;
-
- return bch2_journal_key_insert(c, id, level, &whiteout);
-}
-
-bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
- unsigned level, struct bpos pos)
-{
- struct journal_keys *keys = &trans->c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
- if (!trans->journal_replay_not_finished)
- return false;
-
- return (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- bkey_deleted(&keys->data[idx].k->k));
-}
-
-static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
-{
- struct journal_key *k = keys->data + pos;
- size_t idx = pos_to_idx(keys, pos);
-
- k->overwritten = true;
-
- struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
- struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;
-
- bool prev_overwritten = prev && prev->overwritten;
- bool next_overwritten = next && next->overwritten;
-
- struct journal_key_range_overwritten *prev_range =
- prev_overwritten ? prev->overwritten_range : NULL;
- struct journal_key_range_overwritten *next_range =
- next_overwritten ? next->overwritten_range : NULL;
-
- BUG_ON(prev_range && prev_range->end != idx);
- BUG_ON(next_range && next_range->start != idx + 1);
-
- if (prev_range && next_range) {
- prev_range->end = next_range->end;
-
- keys->data[pos].overwritten_range = prev_range;
- for (size_t i = next_range->start; i < next_range->end; i++) {
- struct journal_key *ip = keys->data + idx_to_pos(keys, i);
- BUG_ON(ip->overwritten_range != next_range);
- ip->overwritten_range = prev_range;
- }
-
- kfree_rcu_mightsleep(next_range);
- } else if (prev_range) {
- prev_range->end++;
- k->overwritten_range = prev_range;
- if (next_overwritten) {
- prev_range->end++;
- next->overwritten_range = prev_range;
- }
- } else if (next_range) {
- next_range->start--;
- k->overwritten_range = next_range;
- if (prev_overwritten) {
- next_range->start--;
- prev->overwritten_range = next_range;
- }
- } else if (prev_overwritten || next_overwritten) {
- struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- return;
-
- r->start = idx - (size_t) prev_overwritten;
- r->end = idx + 1 + (size_t) next_overwritten;
-
- rcu_assign_pointer(k->overwritten_range, r);
- if (prev_overwritten)
- prev->overwritten_range = r;
- if (next_overwritten)
- next->overwritten_range = r;
- }
-}
-
-void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
- unsigned level, struct bpos pos)
-{
- struct journal_keys *keys = &c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
- if (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- !keys->data[idx].overwritten) {
- mutex_lock(&keys->overwrite_lock);
- __bch2_journal_key_overwritten(keys, idx);
- mutex_unlock(&keys->overwrite_lock);
- }
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
- if (iter->idx < iter->keys->size) {
- iter->idx++;
- if (iter->idx == iter->keys->gap)
- iter->idx += iter->keys->size - iter->keys->nr;
- }
-}
-
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
-{
- journal_iter_verify(iter);
-
- guard(rcu)();
- while (iter->idx < iter->keys->size) {
- struct journal_key *k = iter->keys->data + iter->idx;
-
- int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
- if (cmp < 0)
- break;
- BUG_ON(cmp);
-
- if (!k->overwritten)
- return bkey_i_to_s_c(k->k);
-
- if (k->overwritten_range)
- iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
- else
- bch2_journal_iter_advance(iter);
- }
-
- return bkey_s_c_null;
-}
-
-static void bch2_journal_iter_exit(struct journal_iter *iter)
-{
- list_del(&iter->list);
-}
-
-static void bch2_journal_iter_init(struct bch_fs *c,
- struct journal_iter *iter,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- iter->btree_id = id;
- iter->level = level;
- iter->keys = &c->journal_keys;
- iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
-
- journal_iter_verify(iter);
-}
-
-static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
-{
- return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
- iter->b, &iter->unpacked);
-}
-
-static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
-{
- bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
-}
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
-{
- if (bpos_eq(iter->pos, SPOS_MAX))
- iter->at_end = true;
- else
- iter->pos = bpos_successor(iter->pos);
-}
-
-static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
-{
- struct btree_and_journal_iter iter = *_iter;
- struct bch_fs *c = iter.trans->c;
- unsigned level = iter.journal.level;
- struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_started, &c->flags)
- ? (level > 1 ? 0 : 2)
- : (level > 1 ? 1 : 16);
-
- iter.prefetch = false;
- iter.fail_if_too_many_whiteouts = true;
- bch2_bkey_buf_init(&tmp);
-
- while (nr--) {
- bch2_btree_and_journal_iter_advance(&iter);
- struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
- if (!k.k)
- break;
-
- bch2_bkey_buf_reassemble(&tmp, c, k);
- bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
-{
- struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
- size_t iters = 0;
-
- if (iter->prefetch && iter->journal.level)
- btree_and_journal_iter_prefetch(iter);
-again:
- if (iter->at_end)
- return bkey_s_c_null;
-
- iters++;
-
- if (iters > 20 && iter->fail_if_too_many_whiteouts)
- return bkey_s_c_null;
-
- while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
- bpos_lt(btree_k.k->p, iter->pos))
- bch2_journal_iter_advance_btree(iter);
-
- if (iter->trans->journal_replay_not_finished)
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
- bpos_lt(journal_k.k->p, iter->pos))
- bch2_journal_iter_advance(&iter->journal);
-
- ret = journal_k.k &&
- (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
- ? journal_k
- : btree_k;
-
- if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
- ret = bkey_s_c_null;
-
- if (ret.k) {
- iter->pos = ret.k->p;
- if (bkey_deleted(ret.k)) {
- bch2_btree_and_journal_iter_advance(iter);
- goto again;
- }
- } else {
- iter->pos = SPOS_MAX;
- iter->at_end = true;
- }
-
- return ret;
-}
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
-{
- bch2_journal_iter_exit(&iter->journal);
-}
-
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
- struct btree_and_journal_iter *iter,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct bpos pos)
-{
- memset(iter, 0, sizeof(*iter));
-
- iter->trans = trans;
- iter->b = b;
- iter->node_iter = node_iter;
- iter->pos = b->data->min_key;
- iter->at_end = false;
- INIT_LIST_HEAD(&iter->journal.list);
-
- if (trans->journal_replay_not_finished) {
- bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
- if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
- list_add(&iter->journal.list, &trans->c->journal_iters);
- }
-}
-
-/*
- * this version is used by btree_gc before filesystem has gone RW and
- * multithreaded, so uses the journal_iters list:
- */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
- struct btree_and_journal_iter *iter,
- struct btree *b)
-{
- struct btree_node_iter node_iter;
-
- bch2_btree_node_iter_init_from_start(&node_iter, b);
- __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
-}
-
-/* sort and dedup all keys in the journal: */
-
-/*
- * When keys compare equal, oldest compares first:
- */
-static int journal_sort_key_cmp(const void *_l, const void *_r)
-{
- const struct journal_key *l = _l;
- const struct journal_key *r = _r;
- int rewind = l->rewind && r->rewind ? -1 : 1;
-
- return journal_key_cmp(l, r) ?:
- ((cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset)) * rewind);
-}
-
-void bch2_journal_keys_put(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- BUG_ON(atomic_read(&keys->ref) <= 0);
-
- if (!atomic_dec_and_test(&keys->ref))
- return;
-
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i) {
- if (i->overwritten_range &&
- (i == &darray_last(*keys) ||
- i->overwritten_range != i[1].overwritten_range))
- kfree(i->overwritten_range);
-
- if (i->allocated)
- kfree(i->k);
- }
-
- kvfree(keys->data);
- keys->data = NULL;
- keys->nr = keys->gap = keys->size = 0;
-
- struct journal_replay **i;
- struct genradix_iter iter;
-
- genradix_for_each(&c->journal_entries, iter, i)
- kvfree(*i);
- genradix_free(&c->journal_entries);
-}
-
-static void __journal_keys_sort(struct journal_keys *keys)
-{
- sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
- journal_sort_key_cmp, NULL);
-
- cond_resched();
-
- struct journal_key *dst = keys->data;
-
- darray_for_each(*keys, src) {
- /*
- * We don't accumulate accounting keys here because we have to
- * compare each individual accounting key against the version in
- * the btree during replay:
- */
- if (src->k->k.type != KEY_TYPE_accounting &&
- src + 1 < &darray_top(*keys) &&
- !journal_key_cmp(src, src + 1))
- continue;
-
- *dst++ = *src;
- }
-
- keys->nr = dst - keys->data;
-}
-
-int bch2_journal_keys_sort(struct bch_fs *c)
-{
- struct genradix_iter iter;
- struct journal_replay *i, **_i;
- struct journal_keys *keys = &c->journal_keys;
- size_t nr_read = 0;
-
- u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX;
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- cond_resched();
-
- vstruct_for_each(&i->j, entry) {
- bool rewind = !entry->level &&
- !btree_id_is_alloc(entry->btree_id) &&
- le64_to_cpu(i->j.seq) >= rewind_seq;
-
- if (entry->type != (rewind
- ? BCH_JSET_ENTRY_overwrite
- : BCH_JSET_ENTRY_btree_keys))
- continue;
-
- if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start)
- continue;
-
- jset_entry_for_each_key(entry, k) {
- struct journal_key n = (struct journal_key) {
- .btree_id = entry->btree_id,
- .level = entry->level,
- .rewind = rewind,
- .k = k,
- .journal_seq = le64_to_cpu(i->j.seq),
- .journal_offset = k->_data - i->j._data,
- };
-
- if (darray_push(keys, n)) {
- __journal_keys_sort(keys);
-
- if (keys->nr * 8 > keys->size * 7) {
- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
- keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
- return bch_err_throw(c, ENOMEM_journal_keys_sort);
- }
-
- BUG_ON(darray_push(keys, n));
- }
-
- nr_read++;
- }
- }
- }
-
- __journal_keys_sort(keys);
- keys->gap = keys->nr;
-
- bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
- return 0;
-}
-
-void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
- unsigned level_min, unsigned level_max,
- struct bpos start, struct bpos end)
-{
- struct journal_keys *keys = &c->journal_keys;
- size_t dst = 0;
-
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i)
- if (!(i->btree_id == btree &&
- i->level >= level_min &&
- i->level <= level_max &&
- bpos_ge(i->k->k.p, start) &&
- bpos_le(i->k->k.p, end)))
- keys->data[dst++] = *i;
- keys->nr = keys->gap = dst;
-}
-
-void bch2_journal_keys_dump(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
- struct printbuf buf = PRINTBUF;
-
- pr_info("%zu keys:", keys->nr);
-
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i) {
- printbuf_reset(&buf);
- prt_printf(&buf, "btree=");
- bch2_btree_id_to_text(&buf, i->btree_id);
- prt_printf(&buf, " l=%u ", i->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
- pr_err("%s", buf.buf);
- }
- printbuf_exit(&buf);
-}
-
-void bch2_fs_journal_keys_init(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- atomic_set(&keys->ref, 1);
- keys->initial_ref_held = true;
- mutex_init(&keys->overwrite_lock);
-}
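
The peek helpers removed above cache a search position in *idx and fall back to a fresh binary search whenever linear scanning from the cached position takes more than a few steps (the iters == 10 restart). Below is a stand-alone sketch of that pattern over a sorted array of ints; the names are invented for the example and are not bcachefs helpers.

/* illustration only: resume from a cached index, re-search if it has gone stale */
#include <stddef.h>

static size_t lower_bound(const int *a, size_t nr, int key)
{
	size_t l = 0, r = nr;

	while (l < r) {
		size_t m = l + (r - l) / 2;

		if (a[m] < key)
			l = m + 1;
		else
			r = m;
	}
	return l;
}

/* return the first element >= key, resuming from the cached index *idx */
static const int *peek_ge(const int *a, size_t nr, int key, size_t *idx)
{
	unsigned iters = 0;
search:
	if (!*idx)
		*idx = lower_bound(a, nr, key);

	while (*idx < nr && a[*idx] < key) {
		(*idx)++;
		if (++iters == 10) {	/* cached position was stale: redo the search */
			*idx = 0;
			goto search;
		}
	}

	return *idx < nr ? &a[*idx] : NULL;
}

int main(void)
{
	int a[] = { 1, 3, 5, 7, 9 };
	size_t idx = 0;

	const int *p = peek_ge(a, 5, 6, &idx);	/* finds 7, leaves idx == 3 cached */
	return p && *p == 7 ? 0 : 1;
}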
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
deleted file mode 100644
index 2a3082919b8d..000000000000
--- a/fs/bcachefs/btree_journal_iter.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
-#define _BCACHEFS_BTREE_JOURNAL_ITER_H
-
-#include "bkey.h"
-
-struct journal_iter {
- struct list_head list;
- enum btree_id btree_id;
- unsigned level;
- size_t idx;
- struct journal_keys *keys;
-};
-
-/*
- * Iterate over keys in the btree, with keys from the journal overlaid on top:
- */
-
-struct btree_and_journal_iter {
- struct btree_trans *trans;
- struct btree *b;
- struct btree_node_iter node_iter;
- struct bkey unpacked;
-
- struct journal_iter journal;
- struct bpos pos;
- bool at_end;
- bool prefetch;
- bool fail_if_too_many_whiteouts;
-};
-
-static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
- unsigned l_level,
- const struct journal_key *r)
-{
- return -cmp_int(l_level, r->level) ?:
- cmp_int(l_btree_id, r->btree_id);
-}
-
-static inline int __journal_key_cmp(enum btree_id l_btree_id,
- unsigned l_level,
- struct bpos l_pos,
- const struct journal_key *r)
-{
- return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
- bpos_cmp(l_pos, r->k->k.p);
-}
-
-static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
- return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
-}
-
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
- unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
- unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
-
-int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
- struct btree_and_journal_iter *);
-
-int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
- unsigned, struct bkey_i *);
-int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
- unsigned, struct bkey_i *);
-int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
-bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
- struct btree_and_journal_iter *, struct btree *,
- struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
- struct btree_and_journal_iter *, struct btree *);
-
-void bch2_journal_keys_put(struct bch_fs *);
-
-static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
-{
- if (c->journal_keys.initial_ref_held)
- bch2_journal_keys_put(c);
- c->journal_keys.initial_ref_held = false;
-}
-
-int bch2_journal_keys_sort(struct bch_fs *);
-
-void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
- unsigned, unsigned,
- struct bpos, struct bpos);
-
-void bch2_journal_keys_dump(struct bch_fs *);
-
-void bch2_fs_journal_keys_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
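
A worked example of the ordering defined by the comparators as written above (the values are hypothetical, chosen only for illustration): the level field is compared first and negated, so higher levels compare smaller; ties are broken by btree_id and then by position.

/*
 * With the comparators above (example values only):
 *
 *   (level 1, BTREE_ID_extents, pos 10)  sorts before  (level 0, BTREE_ID_extents, pos 5)
 *   (level 0, BTREE_ID_extents, pos 5)   sorts before  (level 0, BTREE_ID_extents, pos 10)
 */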
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
deleted file mode 100644
index 86aacb254fb2..000000000000
--- a/fs/bcachefs/btree_journal_iter_types.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
-#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
-
-struct journal_key_range_overwritten {
- size_t start, end;
-};
-
-struct journal_key {
- u64 journal_seq;
- u32 journal_offset;
- enum btree_id btree_id:8;
- unsigned level:8;
- bool allocated:1;
- bool overwritten:1;
- bool rewind:1;
- struct journal_key_range_overwritten __rcu *
- overwritten_range;
- struct bkey_i *k;
-};
-
-struct journal_keys {
- /* must match layout in darray_types.h */
- size_t nr, size;
- struct journal_key *data;
- /*
- * Gap buffer: instead of all the empty space in the array being at the
- * end of the buffer - from @nr to @size - the empty space is at @gap.
- * This means that sequential insertions are O(n) instead of O(n^2).
- */
- size_t gap;
- atomic_t ref;
- bool initial_ref_held;
- struct mutex overwrite_lock;
-};
-
-#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
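
The gap-buffer layout documented in struct journal_keys above keeps the free space at @gap rather than at the end of the array, so insertions near the gap only move the gap instead of shifting the whole tail. The sketch below is a minimal user-space illustration of the index mapping, assuming valid entries occupy array positions [0, gap) and [gap + (size - nr), size); these helpers are simplified stand-ins for the idx_to_pos()/pos_to_idx() helpers used by btree_journal_iter.c, not the kernel versions.

/* illustration only: logical index <-> array position in a gap buffer */
#include <assert.h>
#include <stddef.h>

struct gap_buf {
	size_t nr;	/* number of valid entries */
	size_t size;	/* capacity of the backing array */
	size_t gap;	/* logical index where the free space sits */
};

static size_t idx_to_pos(const struct gap_buf *b, size_t idx)
{
	size_t gap_size = b->size - b->nr;

	return idx < b->gap ? idx : idx + gap_size;
}

static size_t pos_to_idx(const struct gap_buf *b, size_t pos)
{
	size_t gap_size = b->size - b->nr;

	return pos < b->gap ? pos : pos - gap_size;
}

int main(void)
{
	struct gap_buf b = { .nr = 6, .size = 8, .gap = 4 };

	/* entries 0..3 sit at positions 0..3, entries 4..5 at positions 6..7 */
	assert(idx_to_pos(&b, 3) == 3);
	assert(idx_to_pos(&b, 4) == 6);
	assert(pos_to_idx(&b, 7) == 5);
	return 0;
}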
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
deleted file mode 100644
index d96188b92db2..000000000000
--- a/fs/bcachefs/btree_key_cache.c
+++ /dev/null
@@ -1,880 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "trace.h"
-
-#include <linux/sched/mm.h>
-
-static inline bool btree_uses_pcpu_readers(enum btree_id id)
-{
- return id == BTREE_ID_subvolumes;
-}
-
-static struct kmem_cache *bch2_key_cache;
-
-static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct bkey_cached *ck = obj;
- const struct bkey_cached_key *key = arg->key;
-
- return ck->key.btree_id != key->btree_id ||
- !bpos_eq(ck->key.pos, key->pos);
-}
-
-static const struct rhashtable_params bch2_btree_key_cache_params = {
- .head_offset = offsetof(struct bkey_cached, hash),
- .key_offset = offsetof(struct bkey_cached, key),
- .key_len = sizeof(struct bkey_cached_key),
- .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
- .automatic_shrinking = true,
-};
-
-static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path,
- struct bkey_cached *ck,
- enum btree_node_locked_type lock_held)
-{
- path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
- path->l[0].b = (void *) ck;
- mark_btree_node_locked(trans, path, 0, lock_held);
-}
-
-__flatten
-inline struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
-{
- struct bkey_cached_key key = {
- .btree_id = btree_id,
- .pos = pos,
- };
-
- return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
- bch2_btree_key_cache_params);
-}
-
-static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
-{
- if (!six_trylock_intent(&ck->c.lock))
- return false;
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- six_unlock_intent(&ck->c.lock);
- return false;
- }
-
- if (!six_trylock_write(&ck->c.lock)) {
- six_unlock_intent(&ck->c.lock);
- return false;
- }
-
- return true;
-}
-
-static bool bkey_cached_evict(struct btree_key_cache *c,
- struct bkey_cached *ck)
-{
- bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
- bch2_btree_key_cache_params);
- if (ret) {
- memset(&ck->key, ~0, sizeof(ck->key));
- atomic_long_dec(&c->nr_keys);
- }
-
- return ret;
-}
-
-static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
-{
- struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
- struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
-
- this_cpu_dec(*c->btree_key_cache.nr_pending);
- kmem_cache_free(bch2_key_cache, ck);
-}
-
-static inline void bkey_cached_free_noassert(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- kfree(ck->k);
- ck->k = NULL;
- ck->u64s = 0;
-
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
-
- bool pcpu_readers = ck->c.lock.readers != NULL;
- rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
- this_cpu_inc(*bc->nr_pending);
-}
-
-static void bkey_cached_free(struct btree_trans *trans,
- struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- /*
- * we'll hit strange issues in the SRCU code if we aren't holding an
- * SRCU read lock...
- */
- EBUG_ON(!trans->srcu_held);
-
- bkey_cached_free_noassert(bc, ck);
-}
-
-static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
-{
- gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
-
- struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
- if (unlikely(!ck))
- return NULL;
- ck->k = kmalloc(key_u64s * sizeof(u64), gfp);
- if (unlikely(!ck->k)) {
- kmem_cache_free(bch2_key_cache, ck);
- return NULL;
- }
- ck->u64s = key_u64s;
- return ck;
-}
-
-static struct bkey_cached *
-bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
-{
- struct bch_fs *c = trans->c;
- struct btree_key_cache *bc = &c->btree_key_cache;
- bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
- int ret;
-
- struct bkey_cached *ck = container_of_or_null(
- rcu_pending_dequeue(&bc->pending[pcpu_readers]),
- struct bkey_cached, rcu);
- if (ck)
- goto lock;
-
- ck = allocate_dropping_locks(trans, ret,
- __bkey_cached_alloc(key_u64s, _gfp));
- if (ret) {
- if (ck)
- kfree(ck->k);
- kmem_cache_free(bch2_key_cache, ck);
- return ERR_PTR(ret);
- }
-
- if (ck) {
- bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
- ck->c.cached = true;
- goto lock;
- }
-
- ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
- struct bkey_cached, rcu);
-	if (ck)
-		goto lock;
-	return NULL;
-lock:
- six_lock_intent(&ck->c.lock, NULL, NULL);
- six_lock_write(&ck->c.lock, NULL, NULL);
- return ck;
-}
-
-static struct bkey_cached *
-bkey_cached_reuse(struct btree_key_cache *c)
-{
-	guard(rcu)();
- struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table);
- struct rhash_head *pos;
- struct bkey_cached *ck;
-
- for (unsigned i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bkey_cached_lock_for_evict(ck)) {
- if (bkey_cached_evict(c, ck))
- return ck;
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- }
- }
- return NULL;
-}
-
-static int btree_key_cache_create(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path *ck_path,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct btree_key_cache *bc = &c->btree_key_cache;
-
- /*
- * bch2_varint_decode can read past the end of the buffer by at
- * most 7 bytes (it won't be used):
- */
- unsigned key_u64s = k.k->u64s + 1;
-
- /*
- * Allocate some extra space so that the transaction commit path is less
- * likely to have to reallocate, since that requires a transaction
- * restart:
- */
- key_u64s = min(256U, (key_u64s * 3) / 2);
- key_u64s = roundup_pow_of_two(key_u64s);
-
- struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s);
- int ret = PTR_ERR_OR_ZERO(ck);
- if (ret)
- return ret;
-
- if (unlikely(!ck)) {
- ck = bkey_cached_reuse(bc);
- if (unlikely(!ck)) {
- bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_id_str(ck_path->btree_id));
- return bch_err_throw(c, ENOMEM_btree_key_cache_create);
- }
- }
-
- ck->c.level = 0;
- ck->c.btree_id = ck_path->btree_id;
- ck->key.btree_id = ck_path->btree_id;
- ck->key.pos = ck_path->pos;
- ck->flags = 1U << BKEY_CACHED_ACCESSED;
-
- if (unlikely(key_u64s > ck->u64s)) {
- mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
-
- struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
- kmalloc(key_u64s * sizeof(u64), _gfp));
- if (unlikely(!new_k)) {
- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_id_str(ck->key.btree_id), key_u64s);
-			ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill);
-			goto err;
-		} else if (ret) {
- kfree(new_k);
- goto err;
- }
-
- kfree(ck->k);
- ck->k = new_k;
- ck->u64s = key_u64s;
- }
-
- bkey_reassemble(ck->k, k);
-
- ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c);
- if (unlikely(ret))
- goto err;
-
- ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);
-
- bch2_btree_node_unlock_write(trans, path, path_l(path)->b);
-
- if (unlikely(ret)) /* raced with another fill? */
- goto err;
-
- atomic_long_inc(&bc->nr_keys);
- six_unlock_write(&ck->c.lock);
-
- enum six_lock_type lock_want = __btree_lock_want(ck_path, 0);
- if (lock_want == SIX_LOCK_read)
- six_lock_downgrade(&ck->c.lock);
- btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want);
- ck_path->uptodate = BTREE_ITER_UPTODATE;
- return 0;
-err:
- bkey_cached_free(trans, bc, ck);
- mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
-
- return ret;
-}
-
-static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans,
- struct btree_path *ck_path,
- struct bkey_s_c k)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, ck_path->pos);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, trans->c, k);
- trace_key_cache_fill(trans, buf.buf);
- printbuf_exit(&buf);
-}
-
-static noinline int btree_key_cache_fill(struct btree_trans *trans,
- btree_path_idx_t ck_path_idx,
- unsigned flags)
-{
- struct btree_path *ck_path = trans->paths + ck_path_idx;
-
- if (flags & BTREE_ITER_cached_nofill) {
- ck_path->l[0].b = NULL;
- return 0;
- }
-
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
- BTREE_ITER_intent|
- BTREE_ITER_key_cache_fill|
- BTREE_ITER_cached_nofill);
- iter.flags &= ~BTREE_ITER_with_journal;
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- /* Recheck after btree lookup, before allocating: */
- ck_path = trans->paths + ck_path_idx;
- ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
- if (unlikely(ret))
- goto out;
-
- ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k);
- if (ret)
- goto err;
-
- if (trace_key_cache_fill_enabled())
- do_trace_key_cache_fill(trans, ck_path, k);
-out:
- /* We're not likely to need this iterator again: */
- bch2_set_btree_iter_dontneed(trans, &iter);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
- btree_path_idx_t path_idx)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck;
- struct btree_path *path = trans->paths + path_idx;
-retry:
- ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
- if (!ck)
- return -ENOENT;
-
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
- int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_);
- if (ret)
- return ret;
-
- if (ck->key.btree_id != path->btree_id ||
- !bpos_eq(ck->key.pos, path->pos)) {
- six_unlock_type(&ck->c.lock, lock_want);
- goto retry;
- }
-
- if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
- set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-
- btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
- path->uptodate = BTREE_ITER_UPTODATE;
- return 0;
-}
-
-int bch2_btree_path_traverse_cached(struct btree_trans *trans,
- btree_path_idx_t path_idx,
- unsigned flags)
-{
- EBUG_ON(trans->paths[path_idx].level);
-
- int ret;
- do {
- ret = btree_path_traverse_cached_fast(trans, path_idx);
- if (unlikely(ret == -ENOENT))
- ret = btree_key_cache_fill(trans, path_idx, flags);
- } while (ret == -EEXIST);
-
- struct btree_path *path = trans->paths + path_idx;
-
- if (unlikely(ret)) {
- path->uptodate = BTREE_ITER_NEED_TRAVERSE;
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- btree_node_unlock(trans, path, 0);
- path->l[0].b = ERR_PTR(ret);
- }
- } else {
- BUG_ON(path->uptodate);
- BUG_ON(!path->nodes_locked);
- }
-
- return ret;
-}
-
-static int btree_key_cache_flush_pos(struct btree_trans *trans,
- struct bkey_cached_key key,
- u64 journal_seq,
- unsigned commit_flags,
- bool evict)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct btree_iter c_iter, b_iter;
- struct bkey_cached *ck = NULL;
- int ret;
-
- bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
- BTREE_ITER_slots|
- BTREE_ITER_intent|
- BTREE_ITER_all_snapshots);
- bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- b_iter.flags &= ~BTREE_ITER_with_key_cache;
-
- ret = bch2_btree_iter_traverse(trans, &c_iter);
- if (ret)
- goto out;
-
- ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
- if (!ck)
- goto out;
-
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- if (evict)
- goto evict;
- goto out;
- }
-
- if (journal_seq && ck->journal.seq != journal_seq)
- goto out;
-
- trans->journal_res.seq = ck->journal.seq;
-
- /*
- * If we're at the end of the journal, we really want to free up space
- * in the journal right away - we don't want to pin that old journal
- * sequence number with a new btree node write, we want to re-journal
- * the update
- */
- if (ck->journal.seq == journal_last_seq(j))
- commit_flags |= BCH_WATERMARK_reclaim;
-
- if (ck->journal.seq != journal_last_seq(j) ||
- !test_bit(JOURNAL_space_low, &c->journal.flags))
- commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
-
- struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter);
- ret = bkey_err(btree_k);
- if (ret)
- goto err;
-
-	/* Check that we're not violating cache coherency rules: */
- BUG_ON(bkey_deleted(btree_k.k));
-
- ret = bch2_trans_update(trans, &b_iter, ck->k,
- BTREE_UPDATE_key_cache_reclaim|
- BTREE_UPDATE_internal_snapshot_node|
- BTREE_TRIGGER_norun) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc|
- commit_flags);
-err:
- bch2_fs_fatal_err_on(ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
- !bch2_journal_error(j), c,
- "flushing key cache: %s", bch2_err_str(ret));
- if (ret)
- goto out;
-
- bch2_journal_pin_drop(j, &ck->journal);
-
- struct btree_path *path = btree_iter_path(trans, &c_iter);
- BUG_ON(!btree_node_locked(path, 0));
-
- if (!evict) {
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_dec(&c->btree_key_cache.nr_dirty);
- }
- } else {
- struct btree_path *path2;
- unsigned i;
-evict:
- trans_for_each_path(trans, path2, i)
- if (path2 != path)
- __bch2_btree_path_unlock(trans, path2);
-
- bch2_btree_node_lock_write_nofail(trans, path, &ck->c);
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_dec(&c->btree_key_cache.nr_dirty);
- }
-
- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
- if (bkey_cached_evict(&c->btree_key_cache, ck)) {
- bkey_cached_free(trans, &c->btree_key_cache, ck);
- } else {
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- }
- }
-out:
- bch2_trans_iter_exit(trans, &b_iter);
- bch2_trans_iter_exit(trans, &c_iter);
- return ret;
-}
-
-int bch2_btree_key_cache_journal_flush(struct journal *j,
- struct journal_entry_pin *pin, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bkey_cached *ck =
- container_of(pin, struct bkey_cached, journal);
- struct bkey_cached_key key;
- struct btree_trans *trans = bch2_trans_get(c);
- int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- int ret = 0;
-
- btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
- key = ck->key;
-
- if (ck->journal.seq != seq ||
- !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- six_unlock_read(&ck->c.lock);
- goto unlock;
- }
-
- if (ck->seq != seq) {
- bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
- bch2_btree_key_cache_journal_flush);
- six_unlock_read(&ck->c.lock);
- goto unlock;
- }
- six_unlock_read(&ck->c.lock);
-
- ret = lockrestart_do(trans,
- btree_key_cache_flush_pos(trans, key, seq,
- BCH_TRANS_COMMIT_journal_reclaim, false));
-unlock:
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
- bch2_trans_put(trans);
- return ret;
-}
-
-bool bch2_btree_insert_key_cached(struct btree_trans *trans,
- unsigned flags,
- struct btree_insert_entry *insert_entry)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
- struct bkey_i *insert = insert_entry->k;
- bool kick_reclaim = false;
-
- BUG_ON(insert->k.u64s > ck->u64s);
-
- bkey_copy(ck->k, insert);
-
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- set_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_inc(&c->btree_key_cache.nr_dirty);
-
- if (bch2_nr_btree_keys_need_flush(c))
- kick_reclaim = true;
- }
-
- /*
- * To minimize lock contention, we only add the journal pin here and
- * defer pin updates to the flush callback via ->seq. Be careful not to
- * update ->seq on nojournal commits because we don't want to update the
- * pin to a seq that doesn't include journal updates on disk. Otherwise
- * we risk losing the update after a crash.
- *
- * The only exception is if the pin is not active in the first place. We
- * have to add the pin because journal reclaim drives key cache
- * flushing. The flush callback will not proceed unless ->seq matches
- * the latest pin, so make sure it starts with a consistent value.
- */
- if (!(insert_entry->flags & BTREE_UPDATE_nojournal) ||
- !journal_pin_active(&ck->journal)) {
- ck->seq = trans->journal_res.seq;
- }
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- &ck->journal, bch2_btree_key_cache_journal_flush);
-
- if (kick_reclaim)
- journal_reclaim_kick(&c->journal);
- return true;
-}
-
-void bch2_btree_key_cache_drop(struct btree_trans *trans,
- struct btree_path *path)
-{
- struct bch_fs *c = trans->c;
- struct btree_key_cache *bc = &c->btree_key_cache;
- struct bkey_cached *ck = (void *) path->l[0].b;
-
- /*
- * We just did an update to the btree, bypassing the key cache: the key
- * cache key is now stale and must be dropped, even if dirty:
- */
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
- atomic_long_dec(&c->btree_key_cache.nr_dirty);
- bch2_journal_pin_drop(&c->journal, &ck->journal);
- }
-
- bkey_cached_evict(bc, ck);
- bkey_cached_free(trans, bc, ck);
-
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
-
- struct btree_path *path2;
- unsigned i;
- trans_for_each_path(trans, path2, i)
- if (path2->l[0].b == (void *) ck) {
- /*
- * It's safe to clear should_be_locked here because
- * we're evicting from the key cache, and we still have
- * the underlying btree locked: filling into the key
- * cache would require taking a write lock on the btree
- * node
- */
- path2->should_be_locked = false;
- __bch2_btree_path_unlock(trans, path2);
- path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
- btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE);
- }
-
- bch2_trans_verify_locks(trans);
-}
-
-static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct bch_fs *c = shrink->private_data;
- struct btree_key_cache *bc = &c->btree_key_cache;
- struct bucket_table *tbl;
- struct bkey_cached *ck;
- size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
- unsigned iter, start;
- int srcu_idx;
-
- srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- rcu_read_lock();
-
- tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
-
- /*
-	 * Scanning is expensive while a rehash is in progress - most elements
-	 * will already be on the new hashtable.
- *
- * A rehash could still start while we're scanning - that's ok, we'll
- * still see most elements.
- */
- if (unlikely(tbl->nest)) {
- rcu_read_unlock();
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
- return SHRINK_STOP;
- }
-
- iter = bc->shrink_iter;
- if (iter >= tbl->size)
- iter = 0;
- start = iter;
-
- do {
- struct rhash_head *pos, *next;
-
- pos = rht_ptr_rcu(&tbl->buckets[iter]);
-
- while (!rht_is_a_nulls(pos)) {
- next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
- ck = container_of(pos, struct bkey_cached, hash);
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- bc->skipped_dirty++;
- } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
- clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- bc->skipped_accessed++;
- } else if (!bkey_cached_lock_for_evict(ck)) {
- bc->skipped_lock_fail++;
- } else if (bkey_cached_evict(bc, ck)) {
- bkey_cached_free_noassert(bc, ck);
- bc->freed++;
- freed++;
- } else {
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- }
-
- scanned++;
- if (scanned >= nr)
- goto out;
-
- pos = next;
- }
-
- iter++;
- if (iter >= tbl->size)
- iter = 0;
- } while (scanned < nr && iter != start);
-out:
- bc->shrink_iter = iter;
-
- rcu_read_unlock();
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
- return freed;
-}
-
-static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct bch_fs *c = shrink->private_data;
- struct btree_key_cache *bc = &c->btree_key_cache;
- long nr = atomic_long_read(&bc->nr_keys) -
- atomic_long_read(&bc->nr_dirty);
-
- /*
- * Avoid hammering our shrinker too much if it's nearly empty - the
-	 * shrinker code doesn't take into account how big our cache is; if it's
-	 * mostly empty but the system is under memory pressure, it causes nasty
- * lock contention:
- */
- nr -= 128;
-
- return max(0L, nr);
-}
-
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
- struct bucket_table *tbl;
- struct bkey_cached *ck;
- struct rhash_head *pos;
- LIST_HEAD(items);
- unsigned i;
-
- shrinker_free(bc->shrink);
-
- /*
- * The loop is needed to guard against racing with rehash:
- */
- while (atomic_long_read(&bc->nr_keys)) {
- rcu_read_lock();
- tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- if (tbl) {
- if (tbl->nest) {
- /* wait for in progress rehash */
- rcu_read_unlock();
- mutex_lock(&bc->table.mutex);
- mutex_unlock(&bc->table.mutex);
- continue;
- }
- for (i = 0; i < tbl->size; i++)
- while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
- ck = container_of(pos, struct bkey_cached, hash);
- BUG_ON(!bkey_cached_evict(bc, ck));
- kfree(ck->k);
- kmem_cache_free(bch2_key_cache, ck);
- }
- }
- rcu_read_unlock();
- }
-
- if (atomic_long_read(&bc->nr_dirty) &&
- !bch2_journal_error(&c->journal) &&
- test_bit(BCH_FS_was_rw, &c->flags))
- panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
- atomic_long_read(&bc->nr_dirty));
-
- if (atomic_long_read(&bc->nr_keys))
- panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
- atomic_long_read(&bc->nr_keys));
-
- if (bc->table_init_done)
- rhashtable_destroy(&bc->table);
-
- rcu_pending_exit(&bc->pending[0]);
- rcu_pending_exit(&bc->pending[1]);
-
- free_percpu(bc->nr_pending);
-}
-
-void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
-{
-}
-
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
- struct shrinker *shrink;
-
- bc->nr_pending = alloc_percpu(size_t);
- if (!bc->nr_pending)
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-
- if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
- rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-
- if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
-
- bc->table_init_done = true;
-
- shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
- if (!shrink)
- return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
- bc->shrink = shrink;
- shrink->count_objects = bch2_btree_key_cache_count;
- shrink->scan_objects = bch2_btree_key_cache_scan;
- shrink->batch = 1 << 14;
- shrink->seeks = 0;
- shrink->private_data = c;
- shrinker_register(shrink);
- return 0;
-}
-
-void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
-{
- printbuf_tabstop_push(out, 24);
- printbuf_tabstop_push(out, 12);
-
- prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
- prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
- prt_newline(out);
- prt_printf(out, "shrinker:\n");
- prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
- prt_printf(out, "freed:\t%lu\r\n", bc->freed);
- prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
- prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
- prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
- prt_newline(out);
- prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
-}
-
-void bch2_btree_key_cache_exit(void)
-{
- kmem_cache_destroy(bch2_key_cache);
-}
-
-int __init bch2_btree_key_cache_init(void)
-{
- bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
- if (!bch2_key_cache)
- return -ENOMEM;
-
- return 0;
-}
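
bch2_btree_key_cache_scan() above implements a second-chance policy: the first pass over an entry clears BKEY_CACHED_ACCESSED, and only entries found with the bit already clear (and not dirty) are freed. Below is a stand-alone sketch of the same policy over a plain array, with types invented for the example.

/* illustration only: second-chance eviction over a flat array */
#include <stdbool.h>
#include <stddef.h>

struct cache_entry {
	bool present;
	bool dirty;	/* dirty entries are skipped; they must be flushed first */
	bool accessed;	/* set on use, cleared by the first scan pass */
};

static size_t second_chance_scan(struct cache_entry *e, size_t nr, size_t to_free)
{
	size_t freed = 0;

	for (size_t i = 0; i < nr && freed < to_free; i++) {
		if (!e[i].present || e[i].dirty)
			continue;

		if (e[i].accessed) {
			e[i].accessed = false;	/* spare it this round */
			continue;
		}

		e[i].present = false;		/* evict */
		freed++;
	}

	return freed;
}

int main(void)
{
	struct cache_entry e[3] = {
		{ .present = true, .accessed = true },
		{ .present = true, .dirty = true },
		{ .present = true },
	};

	/* one pass: entry 0 is spared (bit cleared), entry 1 is dirty, entry 2 is freed */
	return second_chance_scan(e, 3, 3) == 1 ? 0 : 1;
}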
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
deleted file mode 100644
index 82d8c72512a9..000000000000
--- a/fs/bcachefs/btree_key_cache.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
-#define _BCACHEFS_BTREE_KEY_CACHE_H
-
-static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
-{
- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
- size_t max_dirty = 1024 + nr_keys / 2;
-
- return max_t(ssize_t, 0, nr_dirty - max_dirty);
-}
-
-static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c)
-{
- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
- size_t max_dirty = 4096 + (nr_keys * 3) / 4;
-
- return nr_dirty - max_dirty;
-}
-
-static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
-{
- return __bch2_btree_key_cache_must_wait(c) > 0;
-}
-
-static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c)
-{
- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
- size_t max_dirty = 2048 + (nr_keys * 5) / 8;
-
- return nr_dirty <= max_dirty;
-}
-
-int bch2_btree_key_cache_journal_flush(struct journal *,
- struct journal_entry_pin *, u64);
-
-struct bkey_cached *
-bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
-
-int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned);
-
-bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
- struct btree_insert_entry *);
-void bch2_btree_key_cache_drop(struct btree_trans *,
- struct btree_path *);
-
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
-void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
-
-void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
-
-void bch2_btree_key_cache_exit(void);
-int __init bch2_btree_key_cache_init(void);
-
-#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
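
The dirty-key thresholds above all scale with the total number of cached keys. A worked example with hypothetical counts, nr_keys = 100000 and nr_dirty = 60000:

/*
 * bch2_nr_btree_keys_need_flush():    max_dirty = 1024 + 100000/2   = 51024; 60000 - 51024 = 8976 keys need flushing
 * __bch2_btree_key_cache_must_wait(): max_dirty = 4096 + 100000*3/4 = 79096; 60000 - 79096 < 0, so no blocking yet
 * bch2_btree_key_cache_wait_done():   max_dirty = 2048 + 100000*5/8 = 64548; 60000 <= 64548, so waiters may proceed
 */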
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
deleted file mode 100644
index 722f1ed10551..000000000000
--- a/fs/bcachefs/btree_key_cache_types.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-
-#include "rcu_pending.h"
-
-struct btree_key_cache {
- struct rhashtable table;
- bool table_init_done;
-
- struct shrinker *shrink;
- unsigned shrink_iter;
-
- /* 0: non pcpu reader locks, 1: pcpu reader locks */
- struct rcu_pending pending[2];
- size_t __percpu *nr_pending;
-
- atomic_long_t nr_keys;
- atomic_long_t nr_dirty;
-
- /* shrinker stats */
- unsigned long requested_to_free;
- unsigned long freed;
- unsigned long skipped_dirty;
- unsigned long skipped_accessed;
- unsigned long skipped_lock_fail;
-};
-
-struct bkey_cached_key {
- u32 btree_id;
- struct bpos pos;
-} __packed __aligned(4);
-
-#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
deleted file mode 100644
index bed2b4b6ffb9..000000000000
--- a/fs/bcachefs/btree_locking.c
+++ /dev/null
@@ -1,936 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_locking.h"
-#include "btree_types.h"
-
-static struct lock_class_key bch2_btree_node_lock_key;
-
-void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
- enum six_lock_init_flags flags,
- gfp_t gfp)
-{
- __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp);
- lockdep_set_notrack_class(&b->lock);
-}
-
-/* Btree node locking: */
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
- struct btree_path *skip,
- struct btree_bkey_cached_common *b,
- unsigned level)
-{
- struct btree_path *path;
- struct six_lock_count ret;
- unsigned i;
-
- memset(&ret, 0, sizeof(ret));
-
- if (IS_ERR_OR_NULL(b))
- return ret;
-
- trans_for_each_path(trans, path, i)
- if (path != skip && &path->l[level].b->c == b) {
- int t = btree_node_locked_type(path, level);
-
- if (t != BTREE_NODE_UNLOCKED)
- ret.n[t]++;
- }
-
- return ret;
-}
-
-/* unlock */
-
-void bch2_btree_node_unlock_write(struct btree_trans *trans,
- struct btree_path *path, struct btree *b)
-{
- bch2_btree_node_unlock_write_inlined(trans, path, b);
-}
-
-/* lock */
-
-/*
- * @trans wants to lock @b with type @type
- */
-struct trans_waiting_for_lock {
- struct btree_trans *trans;
- struct btree_bkey_cached_common *node_want;
- enum six_lock_type lock_want;
-
-	/* for iterating over held locks: */
- u8 path_idx;
- u8 level;
- u64 lock_start_time;
-};
-
-struct lock_graph {
- struct trans_waiting_for_lock g[8];
- unsigned nr;
-};
-
-static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
-{
- struct trans_waiting_for_lock *i;
-
- prt_printf(out, "Found lock cycle (%u entries):\n", g->nr);
-
- for (i = g->g; i < g->g + g->nr; i++) {
- struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
- if (!task)
- continue;
-
- bch2_btree_trans_to_text(out, i->trans);
- bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
- }
-}
-
-static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
-{
- struct trans_waiting_for_lock *i;
-
- for (i = g->g; i != g->g + g->nr; i++) {
- struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
- if (i != g->g)
- prt_str(out, "<- ");
- prt_printf(out, "%u ", task ? task->pid : 0);
- }
- prt_newline(out);
-}
-
-static void lock_graph_up(struct lock_graph *g)
-{
- closure_put(&g->g[--g->nr].trans->ref);
-}
-
-static noinline void lock_graph_pop_all(struct lock_graph *g)
-{
- while (g->nr)
- lock_graph_up(g);
-}
-
-static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i)
-{
- while (g->g + g->nr > i)
- lock_graph_up(g);
-}
-
-static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
-{
- g->g[g->nr++] = (struct trans_waiting_for_lock) {
- .trans = trans,
- .node_want = trans->locking,
- .lock_want = trans->locking_wait.lock_want,
- };
-}
-
-static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
-{
- closure_get(&trans->ref);
- __lock_graph_down(g, trans);
-}
-
-static bool lock_graph_remove_non_waiters(struct lock_graph *g,
- struct trans_waiting_for_lock *from)
-{
- struct trans_waiting_for_lock *i;
-
- if (from->trans->locking != from->node_want) {
- lock_graph_pop_from(g, from);
- return true;
- }
-
- for (i = from + 1; i < g->g + g->nr; i++)
- if (i->trans->locking != i->node_want ||
- i->trans->locking_wait.start_time != i[-1].lock_start_time) {
- lock_graph_pop_from(g, i);
- return true;
- }
-
- return false;
-}
-
-static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
-
- count_event(c, trans_restart_would_deadlock);
-
- if (trace_trans_restart_would_deadlock_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- buf.atomic++;
- print_cycle(&buf, g);
-
- trace_trans_restart_would_deadlock(trans, buf.buf);
- printbuf_exit(&buf);
- }
-}
-
-static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
-{
- if (i == g->g) {
- trace_would_deadlock(g, i->trans);
- return btree_trans_restart_foreign_task(i->trans,
- BCH_ERR_transaction_restart_would_deadlock,
- _THIS_IP_);
- } else {
- i->trans->lock_must_abort = true;
- wake_up_process(i->trans->locking_wait.task);
- return 0;
- }
-}
-
-static int btree_trans_abort_preference(struct btree_trans *trans)
-{
- if (trans->lock_may_not_fail)
- return 0;
- if (trans->locking_wait.lock_want == SIX_LOCK_write)
- return 1;
- if (!trans->in_traverse_all)
- return 2;
- return 3;
-}
-
-static noinline __noreturn void break_cycle_fail(struct lock_graph *g)
-{
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
-
- for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) {
- struct btree_trans *trans = i->trans;
-
- bch2_btree_trans_to_text(&buf, trans);
-
- prt_printf(&buf, "backtrace:\n");
- printbuf_indent_add(&buf, 2);
- bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
-
- bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- BUG();
-}
-
-static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
- struct trans_waiting_for_lock *from)
-{
- struct trans_waiting_for_lock *i, *abort = NULL;
- unsigned best = 0, pref;
- int ret;
-
- if (lock_graph_remove_non_waiters(g, from))
- return 0;
-
- /* Only checking, for debugfs: */
- if (cycle) {
- print_cycle(cycle, g);
- ret = -1;
- goto out;
- }
-
- for (i = from; i < g->g + g->nr; i++) {
- pref = btree_trans_abort_preference(i->trans);
- if (pref > best) {
- abort = i;
- best = pref;
- }
- }
-
- if (unlikely(!best))
- break_cycle_fail(g);
-
- ret = abort_lock(g, abort);
-out:
- if (ret)
- lock_graph_pop_all(g);
- else
- lock_graph_pop_from(g, abort);
- return ret;
-}
-
-static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
- struct printbuf *cycle)
-{
- struct btree_trans *orig_trans = g->g->trans;
-
- for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++)
- if (i->trans == trans) {
- closure_put(&trans->ref);
- return break_cycle(g, cycle, i);
- }
-
- if (unlikely(g->nr == ARRAY_SIZE(g->g))) {
- closure_put(&trans->ref);
-
- if (orig_trans->lock_may_not_fail)
- return 0;
-
- lock_graph_pop_all(g);
-
- if (cycle)
- return 0;
-
- trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
- return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
- }
-
- __lock_graph_down(g, trans);
- return 0;
-}
-
-static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
-{
- return t1 + t2 > 1;
-}
-
-int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
-{
- struct lock_graph g;
- struct trans_waiting_for_lock *top;
- struct btree_bkey_cached_common *b;
- btree_path_idx_t path_idx;
- int ret = 0;
-
- g.nr = 0;
-
- if (trans->lock_must_abort && !trans->lock_may_not_fail) {
- if (cycle)
- return -1;
-
- trace_would_deadlock(&g, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
- }
-
- lock_graph_down(&g, trans);
-
- /* trans->paths is rcu protected vs. freeing */
- guard(rcu)();
- if (cycle)
- cycle->atomic++;
-next:
- if (!g.nr)
- goto out;
-
- top = &g.g[g.nr - 1];
-
- struct btree_path *paths = rcu_dereference(top->trans->paths);
- if (!paths)
- goto up;
-
- unsigned long *paths_allocated = trans_paths_allocated(paths);
-
- trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
- path_idx, top->path_idx) {
- struct btree_path *path = paths + path_idx;
- if (!path->nodes_locked)
- continue;
-
- if (path_idx != top->path_idx) {
- top->path_idx = path_idx;
- top->level = 0;
- top->lock_start_time = 0;
- }
-
- for (;
- top->level < BTREE_MAX_DEPTH;
- top->level++, top->lock_start_time = 0) {
- int lock_held = btree_node_locked_type(path, top->level);
-
- if (lock_held == BTREE_NODE_UNLOCKED)
- continue;
-
- b = &READ_ONCE(path->l[top->level].b)->c;
-
- if (IS_ERR_OR_NULL(b)) {
- /*
- * If we get here, it means we raced with the
- * other thread updating its btree_path
- * structures - which means it can't be blocked
- * waiting on a lock:
- */
- if (!lock_graph_remove_non_waiters(&g, g.g)) {
- /*
- * If lock_graph_remove_non_waiters()
- * didn't do anything, it must be
- * because we're being called by debugfs
- * checking for lock cycles, which
- * invokes us on btree_transactions that
- * aren't actually waiting on anything.
- * Just bail out:
- */
- lock_graph_pop_all(&g);
- }
-
- goto next;
- }
-
- if (list_empty_careful(&b->lock.wait_list))
- continue;
-
- raw_spin_lock(&b->lock.wait_lock);
- list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
- BUG_ON(b != trans->locking);
-
- if (top->lock_start_time &&
- time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
- continue;
-
- top->lock_start_time = trans->locking_wait.start_time;
-
- /* Don't check for self deadlock: */
- if (trans == top->trans ||
- !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
- continue;
-
- closure_get(&trans->ref);
- raw_spin_unlock(&b->lock.wait_lock);
-
- ret = lock_graph_descend(&g, trans, cycle);
- if (ret)
- goto out;
- goto next;
- }
- raw_spin_unlock(&b->lock.wait_lock);
- }
- }
-up:
- if (g.nr > 1 && cycle)
- print_chain(cycle, &g);
- lock_graph_up(&g);
- goto next;
-out:
- if (cycle)
- --cycle->atomic;
- return ret;
-}
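
/*
 * Editor's sketch, not part of this patch: bch2_check_for_deadlock() above is
 * an iterative depth-first search over the waits-for graph, where an edge
 * A -> B means transaction A is blocked on a lock currently held by B; a
 * cycle means a deadlock that has to be broken by restarting one of the
 * participants. A minimal userspace model of that idea, with a made-up
 * fixed-size graph standing in for the real trans/lock wait lists:
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_TRANS 4

/* waits_for[a][b]: transaction a waits on a lock held by transaction b */
static bool waits_for[NR_TRANS][NR_TRANS];

static bool dfs_finds_cycle(int node, bool *on_stack, bool *visited)
{
	visited[node] = on_stack[node] = true;

	for (int next = 0; next < NR_TRANS; next++)
		if (waits_for[node][next] &&
		    (on_stack[next] ||
		     (!visited[next] && dfs_finds_cycle(next, on_stack, visited))))
			return true;

	on_stack[node] = false;
	return false;
}

int main(void)
{
	/* 0 waits on 1, 1 waits on 2, 2 waits on 0: a three-way deadlock */
	waits_for[0][1] = waits_for[1][2] = waits_for[2][0] = true;

	bool on_stack[NR_TRANS] = { false }, visited[NR_TRANS] = { false };
	printf("cycle found: %s\n", dfs_finds_cycle(0, on_stack, visited) ? "yes" : "no");
	return 0;
}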
-
-int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
-{
- struct btree_trans *trans = p;
-
- return bch2_check_for_deadlock(trans, NULL);
-}
-
-int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
- struct btree_bkey_cached_common *b,
- bool lock_may_not_fail)
-{
- int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
- int ret;
-
- /*
- * Must drop our read locks before calling six_lock_write() -
- * six_unlock() won't do wakeups until the reader count
- * goes to 0, and it's safe because we have the node intent
- * locked:
- */
- six_lock_readers_add(&b->lock, -readers);
- ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
- lock_may_not_fail, _RET_IP_);
- six_lock_readers_add(&b->lock, readers);
-
- if (ret)
- mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
-
- return ret;
-}
-
-void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b)
-{
- int ret = __btree_node_lock_write(trans, path, b, true);
- BUG_ON(ret);
-}
-
-/* relock */
-
-static int btree_path_get_locks(struct btree_trans *trans,
- struct btree_path *path,
- bool upgrade,
- struct get_locks_fail *f,
- int restart_err)
-{
- unsigned l = path->level;
-
- do {
- if (!btree_path_node(path, l))
- break;
-
- if (!(upgrade
- ? bch2_btree_node_upgrade(trans, path, l)
- : bch2_btree_node_relock(trans, path, l)))
- goto err;
-
- l++;
- } while (l < path->locks_want);
-
- if (path->uptodate == BTREE_ITER_NEED_RELOCK)
- path->uptodate = BTREE_ITER_UPTODATE;
-
- return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1;
-err:
- if (f) {
- f->l = l;
- f->b = path->l[l].b;
- }
-
- /*
- * Do transaction restart before unlocking, so we don't pop
- * should_be_locked asserts
- */
- if (restart_err) {
- btree_trans_restart(trans, restart_err);
- } else if (path->should_be_locked && !trans->restarted) {
- if (upgrade)
- path->locks_want = l;
- return -1;
- }
-
- __bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-
- /*
- * When we fail to get a lock, we have to ensure that any child nodes
- * can't be relocked so bch2_btree_path_traverse has to walk back up to
- * the node that we failed to relock:
- */
- do {
- path->l[l].b = upgrade
- ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
- : ERR_PTR(-BCH_ERR_no_btree_node_relock);
- } while (l--);
-
- return -restart_err ?: -1;
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned level,
- bool trace)
-{
- struct btree *b = btree_path_node(path, level);
- int want = __btree_lock_want(path, level);
-
- if (race_fault())
- goto fail;
-
- if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
- (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, &b->c, level, want))) {
- mark_btree_node_locked(trans, path, level, want);
- return true;
- }
-fail:
- if (trace && !trans->notrace_relock_fail)
- trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
- return false;
-}
-
-/* upgrade */
-
-bool bch2_btree_node_upgrade(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree *b = path->l[level].b;
-
- if (!is_btree_node(path, level))
- return false;
-
- switch (btree_lock_want(path, level)) {
- case BTREE_NODE_UNLOCKED:
- BUG_ON(btree_node_locked(path, level));
- return true;
- case BTREE_NODE_READ_LOCKED:
- BUG_ON(btree_node_intent_locked(path, level));
- return bch2_btree_node_relock(trans, path, level);
- case BTREE_NODE_INTENT_LOCKED:
- break;
- case BTREE_NODE_WRITE_LOCKED:
- BUG();
- }
-
- if (btree_node_intent_locked(path, level))
- return true;
-
- if (race_fault())
- return false;
-
- if (btree_node_locked(path, level)
- ? six_lock_tryupgrade(&b->c.lock)
- : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
- goto success;
-
- if (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
- btree_node_unlock(trans, path, level);
- goto success;
- }
-
- trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
- return false;
-success:
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
- return true;
-}
-
-/* Btree path locking: */
-
-/*
- * Only for btree_cache.c - only relocks intent locks
- */
-int bch2_btree_path_relock_intent(struct btree_trans *trans,
- struct btree_path *path)
-{
- unsigned l;
-
- for (l = path->level;
- l < path->locks_want && btree_path_node(path, l);
- l++) {
- if (!bch2_btree_node_relock(trans, path, l)) {
- __bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
- trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
- }
- }
-
- return 0;
-}
-
-__flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
-{
- bool ret = !btree_path_get_locks(trans, path, false, NULL, 0);
- bch2_trans_verify_locks(trans);
- return ret;
-}
-
-int __bch2_btree_path_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
-{
- if (!bch2_btree_path_relock_norestart(trans, path)) {
- trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
- }
-
- return 0;
-}
-
-bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- path->locks_want = new_locks_want;
-
- /*
- * If we need it locked, we can't touch it. Otherwise, we can return
- * success - bch2_path_get() will use this path, and it'll just be
- * retraversed:
- */
- bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) ||
- !path->should_be_locked;
-
- bch2_btree_path_verify_locks(trans, path);
- return ret;
-}
-
-int __bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- unsigned old_locks = path->nodes_locked;
- unsigned old_locks_want = path->locks_want;
-
- path->locks_want = max_t(unsigned, path->locks_want, new_locks_want);
-
- struct get_locks_fail f = {};
- int ret = btree_path_get_locks(trans, path, true, &f,
- BCH_ERR_transaction_restart_upgrade);
- if (!ret)
- goto out;
-
- /*
- * XXX: this is ugly - we'd prefer to not be mucking with other
- * iterators in the btree_trans here.
- *
- * On failure to upgrade the iterator, setting iter->locks_want and
- * calling get_locks() is sufficient to make bch2_btree_path_traverse()
- * get the locks we want on transaction restart.
- *
- * But if this iterator was a clone, on transaction restart what we did
- * to this iterator isn't going to be preserved.
- *
- * Possibly we could add an iterator field for the parent iterator when
- * an iterator is a copy - for now, we'll just upgrade any other
- * iterators with the same btree id.
- *
- * The code below used to be needed to ensure ancestor nodes get locked
- * before interior nodes - now that's handled by
- * bch2_btree_path_traverse_all().
- */
- if (!path->cached && !trans->in_traverse_all) {
- struct btree_path *linked;
- unsigned i;
-
- trans_for_each_path(trans, linked, i)
- if (linked != path &&
- linked->cached == path->cached &&
- linked->btree_id == path->btree_id &&
- linked->locks_want < new_locks_want) {
- linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true, NULL, 0);
- }
- }
-
- count_event(trans->c, trans_restart_upgrade);
- if (trace_trans_restart_upgrade_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_);
- prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id));
- bch2_bpos_to_text(&buf, path->pos);
- prt_printf(&buf, "locks want %u -> %u level %u\n",
- old_locks_want, new_locks_want, f.l);
- prt_printf(&buf, "nodes_locked %x -> %x\n",
- old_locks, path->nodes_locked);
- prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) :
- !f.b ? "(null)" : "(node)");
- prt_printf(&buf, "path seq %u node seq %u\n",
- IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq,
- path->l[f.l].lock_seq);
-
- trace_trans_restart_upgrade(trans->c, buf.buf);
- printbuf_exit(&buf);
- }
-out:
- bch2_trans_verify_locks(trans);
- return ret;
-}
-
-void __bch2_btree_path_downgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- unsigned l, old_locks_want = path->locks_want;
-
- if (trans->restarted)
- return;
-
- EBUG_ON(path->locks_want < new_locks_want);
-
- path->locks_want = new_locks_want;
-
- while (path->nodes_locked &&
- (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
- if (l > path->level) {
- btree_node_unlock(trans, path, l);
- } else {
- if (btree_node_intent_locked(path, l)) {
- six_lock_downgrade(&path->l[l].b->c.lock);
- mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
- }
- break;
- }
- }
-
- bch2_btree_path_verify_locks(trans, path);
-
- trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
-}
-
-/* Btree transaction locking: */
-
-void bch2_trans_downgrade(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- if (trans->restarted)
- return;
-
- trans_for_each_path(trans, path, i)
- if (path->ref)
- bch2_btree_path_downgrade(trans, path);
-}
-
-static inline void __bch2_trans_unlock(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_unlock(trans, path);
-}
-
-static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
- struct get_locks_fail *f, bool trace, ulong ip)
-{
- if (!trace)
- goto out;
-
- if (trace_trans_restart_relock_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, path->pos);
- prt_printf(&buf, " %s l=%u seq=%u node seq=",
- bch2_btree_id_str(path->btree_id),
- f->l, path->l[f->l].lock_seq);
- if (IS_ERR_OR_NULL(f->b)) {
- prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
- } else {
- prt_printf(&buf, "%u", f->b->c.lock.seq);
-
- struct six_lock_count c =
- bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l);
- prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
-
- c = six_lock_counts(&f->b->c.lock);
- prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
- }
-
- trace_trans_restart_relock(trans, ip, buf.buf);
- printbuf_exit(&buf);
- }
-
- count_event(trans->c, trans_restart_relock);
-out:
- __bch2_trans_unlock(trans);
- bch2_trans_verify_locks(trans);
-}
-
-static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip)
-{
- bch2_trans_verify_locks(trans);
-
- if (unlikely(trans->restarted))
- return -((int) trans->restarted);
- if (unlikely(trans->locked))
- goto out;
-
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i) {
- struct get_locks_fail f;
- int ret;
-
- if (path->should_be_locked &&
- (ret = btree_path_get_locks(trans, path, false, &f,
- BCH_ERR_transaction_restart_relock))) {
- bch2_trans_relock_fail(trans, path, &f, trace, ip);
- return ret;
- }
- }
-
- trans_set_locked(trans, true);
-out:
- bch2_trans_verify_locks(trans);
- return 0;
-}
-
-int bch2_trans_relock(struct btree_trans *trans)
-{
- return __bch2_trans_relock(trans, true, _RET_IP_);
-}
-
-int bch2_trans_relock_notrace(struct btree_trans *trans)
-{
- return __bch2_trans_relock(trans, false, _RET_IP_);
-}
-
-void bch2_trans_unlock(struct btree_trans *trans)
-{
- trans_set_unlocked(trans);
-
- __bch2_trans_unlock(trans);
-}
-
-void bch2_trans_unlock_long(struct btree_trans *trans)
-{
- bch2_trans_unlock(trans);
- bch2_trans_srcu_unlock(trans);
-}
-
-void bch2_trans_unlock_write(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++)
- if (btree_node_write_locked(path, l))
- bch2_btree_node_unlock_write(trans, path, path->l[l].b);
-}
-
-int __bch2_trans_mutex_lock(struct btree_trans *trans,
- struct mutex *lock)
-{
- int ret = drop_locks_do(trans, (mutex_lock(lock), 0));
-
- if (ret)
- mutex_unlock(lock);
- return ret;
-}
-
-/* Debug */
-
-void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path)
-{
- if (!path->nodes_locked && btree_path_node(path, path->level)) {
- /*
- * A path may be uptodate and yet have nothing locked if and only if
- * there is no node at path->level, which generally means we were
- * iterating over all nodes and got to the end of the btree
- */
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE);
- BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
- }
-
- if (!path->nodes_locked)
- return;
-
- for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
- int want = btree_lock_want(path, l);
- int have = btree_node_locked_type_nowrite(path, l);
-
- BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
-
- BUG_ON(is_btree_node(path, l) && want != have);
-
- BUG_ON(btree_node_locked(path, l) &&
- path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
- }
-}
-
-static bool bch2_trans_locked(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (path->nodes_locked)
- return true;
- return false;
-}
-
-void __bch2_trans_verify_locks(struct btree_trans *trans)
-{
- if (!trans->locked) {
- BUG_ON(bch2_trans_locked(trans));
- return;
- }
-
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- __bch2_btree_path_verify_locks(trans, path);
-}
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
deleted file mode 100644
index f2173a3316f4..000000000000
--- a/fs/bcachefs/btree_locking.h
+++ /dev/null
@@ -1,466 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_LOCKING_H
-#define _BCACHEFS_BTREE_LOCKING_H
-
-/*
- * Only for internal btree use:
- *
- * The btree iterator tracks what locks it wants to take, and what locks it
- * currently has - here we have wrappers for locking/unlocking btree nodes and
- * updating the iterator state
- */
-
-#include "btree_iter.h"
-#include "six.h"
-
-void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
-
-void bch2_trans_unlock_write(struct btree_trans *);
-
-static inline bool is_btree_node(struct btree_path *path, unsigned l)
-{
- return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
-}
-
-static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
-{
- return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
- ? &trans->c->btree_transaction_stats[trans->fn_idx]
- : NULL;
-}
-
-/* matches six lock types */
-enum btree_node_locked_type {
- BTREE_NODE_UNLOCKED = -1,
- BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
- BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
- BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write,
-};
-
-static inline int btree_node_locked_type(struct btree_path *path,
- unsigned level)
-{
- return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
-}
-
-static inline int btree_node_locked_type_nowrite(struct btree_path *path,
- unsigned level)
-{
- int have = btree_node_locked_type(path, level);
- return have == BTREE_NODE_WRITE_LOCKED
- ? BTREE_NODE_INTENT_LOCKED
- : have;
-}
-
-static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
-{
- return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
-}
-
-static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
-{
- return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
-}
-
-static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
-{
- return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
-}
-
-static inline bool btree_node_locked(struct btree_path *path, unsigned level)
-{
- return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
-}
-
-static inline void mark_btree_node_locked_noreset(struct btree_path *path,
- unsigned level,
- enum btree_node_locked_type type)
-{
- /* relying on this to avoid a branch */
- BUILD_BUG_ON(SIX_LOCK_read != 0);
- BUILD_BUG_ON(SIX_LOCK_intent != 1);
-
- path->nodes_locked &= ~(3U << (level << 1));
- path->nodes_locked |= (type + 1) << (level << 1);
-}
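
/*
 * Editor's sketch, not part of this patch: path->nodes_locked packs one of
 * four lock states per btree level into two bits, which is what the helpers
 * above and the lowest/highest-level helpers further down decode with plain
 * bit arithmetic. A standalone model, using compiler builtins where the
 * kernel uses __ffs()/__fls():
 */
#include <stdio.h>

enum locked { UNLOCKED = -1, READ_LOCKED, INTENT_LOCKED, WRITE_LOCKED };

static int locked_type(unsigned nodes_locked, unsigned level)
{
	return UNLOCKED + ((nodes_locked >> (level << 1)) & 3);
}

static unsigned mark_locked(unsigned nodes_locked, unsigned level, enum locked t)
{
	nodes_locked &= ~(3U << (level << 1));
	return nodes_locked | (unsigned) (t + 1) << (level << 1);
}

static int lowest_level_locked(unsigned nodes_locked)
{
	return __builtin_ctz(nodes_locked) >> 1;		/* like __ffs() >> 1 */
}

static int highest_level_locked(unsigned nodes_locked)
{
	return (31 - __builtin_clz(nodes_locked)) >> 1;		/* like __fls() >> 1 */
}

int main(void)
{
	unsigned nl = 0;

	nl = mark_locked(nl, 0, READ_LOCKED);
	nl = mark_locked(nl, 2, INTENT_LOCKED);
	printf("l0=%d l1=%d l2=%d lowest=%d highest=%d\n",
	       locked_type(nl, 0), locked_type(nl, 1), locked_type(nl, 2),
	       lowest_level_locked(nl), highest_level_locked(nl));
	return 0;
}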
-
-static inline void mark_btree_node_locked(struct btree_trans *trans,
- struct btree_path *path,
- unsigned level,
- enum btree_node_locked_type type)
-{
- mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- path->l[level].lock_taken_time = local_clock();
-#endif
-}
-
-static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
-{
- return level < path->locks_want
- ? SIX_LOCK_intent
- : SIX_LOCK_read;
-}
-
-static inline enum btree_node_locked_type
-btree_lock_want(struct btree_path *path, int level)
-{
- if (level < path->level)
- return BTREE_NODE_UNLOCKED;
- if (level < path->locks_want)
- return BTREE_NODE_INTENT_LOCKED;
- if (level == path->level)
- return BTREE_NODE_READ_LOCKED;
- return BTREE_NODE_UNLOCKED;
-}
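
/*
 * Editor's sketch, not part of this patch: the policy above wants intent
 * locks from the path's level up to (but excluding) locks_want, a read lock
 * at the path's level when no intent lock is wanted there, and nothing on any
 * other level. A tiny standalone table generator using the same -1/0/1
 * encoding as the enum above:
 */
#include <stdio.h>

static int lock_want(unsigned level, unsigned path_level, unsigned locks_want)
{
	if (level < path_level)
		return -1;			/* unlocked */
	if (level < locks_want)
		return 1;			/* intent */
	return level == path_level ? 0 : -1;	/* read at the path's level, else nothing */
}

int main(void)
{
	/* e.g. a leaf path (level 0) that wants intent locks on levels 0 and 1 */
	for (unsigned l = 0; l < 4; l++)
		printf("level %u -> want %d\n", l, lock_want(l, 0, 2));
	return 0;
}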
-
-static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
- path->l[level].lock_taken_time,
- local_clock());
-#endif
-}
-
-/* unlock: */
-
-void bch2_btree_node_unlock_write(struct btree_trans *,
- struct btree_path *, struct btree *);
-
-static inline void btree_node_unlock(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- int lock_type = btree_node_locked_type(path, level);
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
-
- if (lock_type != BTREE_NODE_UNLOCKED) {
- if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) {
- bch2_btree_node_unlock_write(trans, path, path->l[level].b);
- lock_type = BTREE_NODE_INTENT_LOCKED;
- }
- six_unlock_type(&path->l[level].b->c.lock, lock_type);
- btree_trans_lock_hold_time_update(trans, path, level);
- mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
- }
-}
-
-static inline int btree_path_lowest_level_locked(struct btree_path *path)
-{
- return __ffs(path->nodes_locked) >> 1;
-}
-
-static inline int btree_path_highest_level_locked(struct btree_path *path)
-{
- return __fls(path->nodes_locked) >> 1;
-}
-
-static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
- struct btree_path *path)
-{
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK);
-
- while (path->nodes_locked)
- btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
-}
-
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
-static inline void
-__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b)
-{
- if (!b->c.lock.write_lock_recurse) {
- struct btree_path *linked;
- unsigned i;
-
- trans_for_each_path_with_node(trans, b, linked, i)
- linked->l[b->c.level].lock_seq++;
- }
-
- six_unlock_write(&b->c.lock);
-}
-
-static inline void
-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
- struct btree *b)
-{
- EBUG_ON(path->l[b->c.level].b != b);
- EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
- EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
-
- mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- __bch2_btree_node_unlock_write(trans, b);
-}
-
-int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
-
-/* lock: */
-
-static inline void trans_set_locked(struct btree_trans *trans, bool try)
-{
- if (!trans->locked) {
- lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_);
- trans->locked = true;
- trans->last_unlock_ip = 0;
-
- trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0;
- current->flags |= PF_MEMALLOC_NOFS;
- }
-}
-
-static inline void trans_set_unlocked(struct btree_trans *trans)
-{
- if (trans->locked) {
- lock_release(&trans->dep_map, _THIS_IP_);
- trans->locked = false;
- trans->last_unlock_ip = _RET_IP_;
-
- if (!trans->pf_memalloc_nofs)
- current->flags &= ~PF_MEMALLOC_NOFS;
- }
-}
-
-static inline int __btree_node_lock_nopath(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- enum six_lock_type type,
- bool lock_may_not_fail,
- unsigned long ip)
-{
- trans->lock_may_not_fail = lock_may_not_fail;
- trans->lock_must_abort = false;
- trans->locking = b;
-
- int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
- bch2_six_check_for_deadlock, trans, ip);
- WRITE_ONCE(trans->locking, NULL);
- WRITE_ONCE(trans->locking_wait.start_time, 0);
-
- if (!ret)
- trace_btree_path_lock(trans, _THIS_IP_, b);
- return ret;
-}
-
-static inline int __must_check
-btree_node_lock_nopath(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- enum six_lock_type type,
- unsigned long ip)
-{
- return __btree_node_lock_nopath(trans, b, type, false, ip);
-}
-
-static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- enum six_lock_type type)
-{
- int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
-
- BUG_ON(ret);
-}
-
-/*
- * Lock a btree node if we already have it locked on one of our linked
- * iterators:
- */
-static inline bool btree_node_lock_increment(struct btree_trans *trans,
- struct btree_bkey_cached_common *b,
- unsigned level,
- enum btree_node_locked_type want)
-{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path(trans, path, i)
- if (&path->l[level].b->c == b &&
- btree_node_locked_type(path, level) >= want) {
- six_lock_increment(&b->lock, (enum six_lock_type) want);
- return true;
- }
-
- return false;
-}
-
-static inline int btree_node_lock(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b,
- unsigned level,
- enum six_lock_type type,
- unsigned long ip)
-{
- int ret = 0;
-
- EBUG_ON(level >= BTREE_MAX_DEPTH);
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- if (likely(six_trylock_type(&b->lock, type)) ||
- btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
- !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- path->l[b->level].lock_taken_time = local_clock();
-#endif
- }
-
- return ret;
-}
-
-int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
- struct btree_bkey_cached_common *b, bool);
-
-static inline int __btree_node_lock_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b,
- bool lock_may_not_fail)
-{
- EBUG_ON(&path->l[b->level].b->c != b);
- EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
- EBUG_ON(!btree_node_intent_locked(path, b->level));
-
- /*
- * six locks are unfair, and read locks block while a thread wants a
- * write lock: thus, we need to tell the cycle detector we have a write
- * lock _before_ taking the lock:
- */
- mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED);
-
- return likely(six_trylock_write(&b->lock))
- ? 0
- : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
-}
-
-static inline int __must_check
-bch2_btree_node_lock_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b)
-{
- return __btree_node_lock_write(trans, path, b, false);
-}
-
-void bch2_btree_node_lock_write_nofail(struct btree_trans *,
- struct btree_path *,
- struct btree_bkey_cached_common *);
-
-/* relock: */
-
-bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
-int __bch2_btree_path_relock(struct btree_trans *,
- struct btree_path *, unsigned long);
-
-static inline int bch2_btree_path_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
-{
- return btree_node_locked(path, path->level)
- ? 0
- : __bch2_btree_path_relock(trans, path, trace_ip);
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
-
-static inline bool bch2_btree_node_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- EBUG_ON(btree_node_locked(path, level) &&
- !btree_node_write_locked(path, level) &&
- btree_node_locked_type(path, level) != __btree_lock_want(path, level));
-
- return likely(btree_node_locked(path, level)) ||
- (!IS_ERR_OR_NULL(path->l[level].b) &&
- __bch2_btree_node_relock(trans, path, level, true));
-}
-
-static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- EBUG_ON(btree_node_locked(path, level) &&
- btree_node_locked_type_nowrite(path, level) !=
- __btree_lock_want(path, level));
-
- return likely(btree_node_locked(path, level)) ||
- (!IS_ERR_OR_NULL(path->l[level].b) &&
- __bch2_btree_node_relock(trans, path, level, false));
-}
-
-/* upgrade */
-
-bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned);
-
-static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- return new_locks_want > path->locks_want
- ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want)
- : true;
-}
-
-int __bch2_btree_path_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
-
-static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
-
- return likely(path->locks_want >= new_locks_want && path->nodes_locked)
- ? 0
- : __bch2_btree_path_upgrade(trans, path, new_locks_want);
-}
-
-/* misc: */
-
-static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path)
-{
- EBUG_ON(!btree_node_locked(path, path->level));
- EBUG_ON(path->uptodate);
-
- if (!path->should_be_locked) {
- path->should_be_locked = true;
- trace_btree_path_should_be_locked(trans, path);
- }
-}
-
-static inline void __btree_path_set_level_up(struct btree_trans *trans,
- struct btree_path *path,
- unsigned l)
-{
- btree_node_unlock(trans, path, l);
- path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
-}
-
-static inline void btree_path_set_level_up(struct btree_trans *trans,
- struct btree_path *path)
-{
- __btree_path_set_level_up(trans, path, path->level++);
- btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
-}
-
-/* debug */
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
- struct btree_path *,
- struct btree_bkey_cached_common *b,
- unsigned);
-
-int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
-
-void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *);
-void __bch2_trans_verify_locks(struct btree_trans *);
-
-static inline void bch2_btree_path_verify_locks(struct btree_trans *trans,
- struct btree_path *path)
-{
- if (static_branch_unlikely(&bch2_debug_check_btree_locking))
- __bch2_btree_path_verify_locks(trans, path);
-}
-
-static inline void bch2_trans_verify_locks(struct btree_trans *trans)
-{
- if (static_branch_unlikely(&bch2_debug_check_btree_locking))
- __bch2_trans_verify_locks(trans);
-}
-
-#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
deleted file mode 100644
index a3fb07c60e25..000000000000
--- a/fs/bcachefs/btree_node_scan.c
+++ /dev/null
@@ -1,611 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_journal_iter.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "error.h"
-#include "journal_io.h"
-#include "recovery_passes.h"
-
-#include <linux/kthread.h>
-#include <linux/min_heap.h>
-#include <linux/sched/sysctl.h>
-#include <linux/sort.h>
-
-struct find_btree_nodes_worker {
- struct closure *cl;
- struct find_btree_nodes *f;
- struct bch_dev *ca;
-};
-
-static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
-{
- bch2_btree_id_level_to_text(out, n->btree_id, n->level);
- prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
- n->seq, n->journal_seq, n->cookie);
- bch2_bpos_to_text(out, n->min_key);
- prt_str(out, "-");
- bch2_bpos_to_text(out, n->max_key);
-
- if (n->range_updated)
- prt_str(out, " range updated");
-
- for (unsigned i = 0; i < n->nr_ptrs; i++) {
- prt_char(out, ' ');
- bch2_extent_ptr_to_text(out, c, n->ptrs + i);
- }
-}
-
-static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
-{
- printbuf_indent_add(out, 2);
- darray_for_each(nodes, i) {
- found_btree_node_to_text(out, c, i);
- prt_newline(out);
- }
- printbuf_indent_sub(out, 2);
-}
-
-static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
-{
- struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
-
- set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
- bp->k.p = f->max_key;
- bp->v.seq = cpu_to_le64(f->cookie);
- bp->v.sectors_written = 0;
- bp->v.flags = 0;
- bp->v.sectors_written = cpu_to_le16(f->sectors_written);
- bp->v.min_key = f->min_key;
- SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
- memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
-}
-
-static inline u64 bkey_journal_seq(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode_v3:
- return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
- default:
- return 0;
- }
-}
-
-static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
-{
- const struct found_btree_node *l = _l;
- const struct found_btree_node *r = _r;
-
- return cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- cmp_int(l->cookie, r->cookie);
-}
-
-/*
- * Given two found btree nodes, if their sequence numbers are equal, take the
- * one that's readable:
- */
-static int found_btree_node_cmp_time(const struct found_btree_node *l,
- const struct found_btree_node *r)
-{
- return cmp_int(l->seq, r->seq) ?:
- cmp_int(l->journal_seq, r->journal_seq);
-}
-
-static int found_btree_node_cmp_pos(const void *_l, const void *_r)
-{
- const struct found_btree_node *l = _l;
- const struct found_btree_node *r = _r;
-
- return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(l->level, r->level) ?:
- bpos_cmp(l->min_key, r->min_key) ?:
- -found_btree_node_cmp_time(l, r);
-}
-
-static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
-{
- return found_btree_node_cmp_pos(l, r) < 0;
-}
-
-static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
-{
- struct found_btree_node *l = _l;
- struct found_btree_node *r = _r;
-
- swap(*l, *r);
-}
-
-static const struct min_heap_callbacks found_btree_node_heap_cbs = {
- .less = found_btree_node_cmp_pos_less,
- .swp = found_btree_node_swap,
-};
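
/*
 * Editor's sketch, not part of this patch: found nodes are ordered by btree
 * id, then descending level, then min_key, with ties between copies of the
 * same range broken in favour of the newest version so the winner sorts
 * first. A standalone model with plain ints standing in for struct bpos and
 * the seq/journal_seq pair:
 */
#include <stdio.h>
#include <stdlib.h>

struct node { int btree_id, level, min_key, seq; };

static int cmp_int_(int l, int r)
{
	return (l > r) - (l < r);
}

static int node_cmp_pos(const void *_l, const void *_r)
{
	const struct node *l = _l, *r = _r;
	int c;

	if ((c = cmp_int_(l->btree_id, r->btree_id)))
		return c;
	if ((c = -cmp_int_(l->level, r->level)))
		return c;
	if ((c = cmp_int_(l->min_key, r->min_key)))
		return c;
	return -cmp_int_(l->seq, r->seq);	/* newest first on exact ties */
}

int main(void)
{
	struct node nodes[] = {
		{ 1, 0, 10, 5 },
		{ 1, 1,  0, 3 },
		{ 1, 0, 10, 7 },	/* newer copy of the same range sorts ahead */
		{ 0, 0,  0, 1 },
	};

	qsort(nodes, 4, sizeof(nodes[0]), node_cmp_pos);
	for (int i = 0; i < 4; i++)
		printf("btree %d level %d min %d seq %d\n",
		       nodes[i].btree_id, nodes[i].level,
		       nodes[i].min_key, nodes[i].seq);
	return 0;
}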
-
-static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
- struct btree *b, struct bio *bio, u64 offset)
-{
- struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
- struct btree_node *bn = b->data;
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
- bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, b->data, c->opts.block_size);
-
- u64 submit_time = local_clock();
- submit_bio_wait(bio);
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca,
- "IO error in try_read_btree_node() at %llu: %s",
- offset, bch2_blk_status_to_str(bio->bi_status));
- return;
- }
-
- if (le64_to_cpu(bn->magic) != bset_magic(c))
- return;
-
- if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
- if (!c->chacha20_key_set)
- return;
-
- struct nonce nonce = btree_nonce(&bn->keys, 0);
- unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
- bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
- }
-
- if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
- return;
-
- if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
- return;
-
- if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
- return;
-
- rcu_read_lock();
- struct found_btree_node n = {
- .btree_id = BTREE_NODE_ID(bn),
- .level = BTREE_NODE_LEVEL(bn),
- .seq = BTREE_NODE_SEQ(bn),
- .cookie = le64_to_cpu(bn->keys.seq),
- .min_key = bn->min_key,
- .max_key = bn->max_key,
- .nr_ptrs = 1,
- .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
- .ptrs[0].offset = offset,
- .ptrs[0].dev = ca->dev_idx,
- .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
- };
- rcu_read_unlock();
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
- bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, b->data, c->opts.btree_node_size);
-
- submit_time = local_clock();
- submit_bio_wait(bio);
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
-
- found_btree_node_to_key(&b->key, &n);
-
- CLASS(printbuf, buf)();
- if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
- /* read_done will swap out b->data for another buffer */
- bn = b->data;
- /*
- * Grab journal_seq here because we want the max journal_seq of
- * any bset; read_done sorts down to a single set and picks the
- * max journal_seq
- */
-		n.journal_seq = le64_to_cpu(bn->keys.journal_seq);
- n.sectors_written = b->written;
-
- mutex_lock(&f->lock);
- if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
- bch_err(c, "try_read_btree_node() can't handle endian conversion");
- f->ret = -EINVAL;
- goto unlock;
- }
-
- if (darray_push(&f->nodes, n))
- f->ret = -ENOMEM;
-unlock:
- mutex_unlock(&f->lock);
- }
-}
-
-static int read_btree_nodes_worker(void *p)
-{
- struct find_btree_nodes_worker *w = p;
- struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
- struct bch_dev *ca = w->ca;
- unsigned long last_print = jiffies;
- struct btree *b = NULL;
- struct bio *bio = NULL;
-
- b = __bch2_btree_node_mem_alloc(c);
- if (!b) {
- bch_err(c, "read_btree_nodes_worker: error allocating buf");
- w->f->ret = -ENOMEM;
- goto err;
- }
-
- bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
- if (!bio) {
- bch_err(c, "read_btree_nodes_worker: error allocating bio");
- w->f->ret = -ENOMEM;
- goto err;
- }
-
- for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
- for (unsigned bucket_offset = 0;
- bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
- bucket_offset += btree_sectors(c)) {
- if (time_after(jiffies, last_print + HZ * 30)) {
- u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
- u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
-
- bch_info(ca, "%s: %2u%% done", __func__,
- (unsigned) div64_u64(cur_sector * 100, end_sector));
- last_print = jiffies;
- }
-
- u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
- !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
- continue;
-
- try_read_btree_node(w->f, ca, b, bio, sector);
- }
-err:
- if (b)
- __btree_node_data_free(b);
- kfree(b);
- bio_put(bio);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
- closure_put(w->cl);
- kfree(w);
- return 0;
-}
-
-static int read_btree_nodes(struct find_btree_nodes *f)
-{
- struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
- struct closure cl;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
- if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
- continue;
-
- struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
- if (!w) {
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
- ret = -ENOMEM;
- goto err;
- }
-
- w->cl = &cl;
- w->f = f;
- w->ca = ca;
-
- struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
- ret = PTR_ERR_OR_ZERO(t);
- if (ret) {
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
- kfree(w);
- bch_err_msg(c, ret, "starting kthread");
- break;
- }
-
- closure_get(&cl);
- enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
- wake_up_process(t);
- }
-err:
- while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2))
- ;
- return f->ret ?: ret;
-}
-
-static bool nodes_overlap(const struct found_btree_node *l,
- const struct found_btree_node *r)
-{
- return (l->btree_id == r->btree_id &&
- l->level == r->level &&
- bpos_gt(l->max_key, r->min_key));
-}
-
-static int handle_overwrites(struct bch_fs *c,
- struct found_btree_node *l,
- found_btree_nodes *nodes_heap)
-{
- struct found_btree_node *r;
-
- while ((r = min_heap_peek(nodes_heap)) &&
- nodes_overlap(l, r)) {
- int cmp = found_btree_node_cmp_time(l, r);
-
- if (cmp > 0) {
- if (bpos_cmp(l->max_key, r->max_key) >= 0)
- min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
- else {
-				r->range_updated = true;
-				r->min_key = bpos_successor(l->max_key);
- min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
- }
- } else if (cmp < 0) {
- BUG_ON(bpos_eq(l->min_key, r->min_key));
-
- l->max_key = bpos_predecessor(r->min_key);
- l->range_updated = true;
- } else if (r->level) {
- min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
- } else {
- if (bpos_cmp(l->max_key, r->max_key) >= 0)
- min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
- else {
-				r->range_updated = true;
-				r->min_key = bpos_successor(l->max_key);
- min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
- }
- }
-
- cond_resched();
- }
-
- return 0;
-}
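
/*
 * Editor's sketch, not part of this patch: handle_overwrites() above resolves
 * two overlapping found nodes by keeping the newer one intact and either
 * dropping the older one (when fully covered) or trimming its range to the
 * non-overlapping remainder. The same idea on half-open integer ranges, as a
 * simplified model that only trims one side:
 */
#include <stdio.h>

struct range { int start, end; };	/* [start, end) */

/* returns 0 if the older range survives (possibly trimmed), -1 if dropped */
static int resolve_overlap(const struct range *newer, struct range *older)
{
	if (older->start >= newer->end || newer->start >= older->end)
		return 0;			/* no overlap */
	if (older->start >= newer->start && older->end <= newer->end)
		return -1;			/* fully covered by the newer node */
	if (older->start < newer->start)
		older->end = newer->start;	/* keep the part before the newer node */
	else
		older->start = newer->end;	/* keep the part after it */
	return 0;
}

int main(void)
{
	struct range newer = { 10, 20 }, older = { 15, 30 };

	if (!resolve_overlap(&newer, &older))
		printf("older node trimmed to [%d, %d)\n", older.start, older.end);
	return 0;
}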
-
-int bch2_scan_for_btree_nodes(struct bch_fs *c)
-{
- struct find_btree_nodes *f = &c->found_btree_nodes;
- struct printbuf buf = PRINTBUF;
- found_btree_nodes nodes_heap = {};
- size_t dst;
- int ret = 0;
-
- if (f->nodes.nr)
- return 0;
-
- mutex_init(&f->lock);
-
- ret = read_btree_nodes(f);
- if (ret)
- return ret;
-
- if (!f->nodes.nr) {
- bch_err(c, "%s: no btree nodes found", __func__);
- ret = -EINVAL;
- goto err;
- }
-
- if (0 && c->opts.verbose) {
- printbuf_reset(&buf);
- prt_printf(&buf, "%s: nodes found:\n", __func__);
- found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_str(c, KERN_INFO, buf.buf);
- }
-
- sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
-
- dst = 0;
- darray_for_each(f->nodes, i) {
- struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
-
- if (prev &&
- prev->cookie == i->cookie) {
- if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
- bch_err(c, "%s: found too many replicas for btree node", __func__);
- ret = -EINVAL;
- goto err;
- }
- prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
- } else {
- f->nodes.data[dst++] = *i;
- }
- }
- f->nodes.nr = dst;
-
- sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
-
- if (0 && c->opts.verbose) {
- printbuf_reset(&buf);
- prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
- found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_str(c, KERN_INFO, buf.buf);
- }
-
- swap(nodes_heap, f->nodes);
-
- {
- /* darray must have same layout as a heap */
- min_heap_char real_heap;
- BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
- BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
- BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
- BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
- }
-
- min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);
-
- if (nodes_heap.nr) {
- ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
- if (ret)
- goto err;
-
- min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
- }
-
- while (true) {
- ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
- if (ret)
- goto err;
-
- if (!nodes_heap.nr)
- break;
-
- ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
- if (ret)
- goto err;
-
- min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
- }
-
- for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
- BUG_ON(nodes_overlap(n, n + 1));
-
- if (0 && c->opts.verbose) {
- printbuf_reset(&buf);
- prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
- found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_str(c, KERN_INFO, buf.buf);
- } else {
- bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
- }
-
- eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
-err:
- darray_exit(&nodes_heap);
- printbuf_exit(&buf);
- return ret;
-}
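
/*
 * Editor's sketch, not part of this patch: after sorting by cookie, the merge
 * loop above compacts the array in place, folding each entry that shares a
 * cookie with the previous survivor into that survivor's pointer list (the
 * real code treats running out of pointer slots as an error). The same
 * in-place pattern on a toy array:
 */
#include <stdio.h>

struct found { unsigned long long cookie; int nr_ptrs; int ptrs[4]; };

static size_t merge_by_cookie(struct found *v, size_t nr)
{
	size_t dst = 0;

	for (size_t i = 0; i < nr; i++) {
		struct found *prev = dst ? &v[dst - 1] : NULL;

		if (prev && prev->cookie == v[i].cookie && prev->nr_ptrs < 4)
			prev->ptrs[prev->nr_ptrs++] = v[i].ptrs[0];
		else
			v[dst++] = v[i];
	}
	return dst;
}

int main(void)
{
	/* already sorted by cookie; one device pointer per raw entry */
	struct found v[] = {
		{ 0xaaa, 1, { 0 } }, { 0xaaa, 1, { 1 } }, { 0xbbb, 1, { 2 } },
	};
	size_t nr = merge_by_cookie(v, 3);

	for (size_t i = 0; i < nr; i++)
		printf("cookie %llx: %d replica(s)\n", v[i].cookie, v[i].nr_ptrs);
	return 0;
}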
-
-static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
-{
- const struct found_btree_node *l = _l;
- const struct found_btree_node *r = _r;
-
- return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(l->level, r->level) ?:
- bpos_cmp(l->max_key, r->min_key);
-}
-
-#define for_each_found_btree_node_in_range(_f, _search, _idx) \
- for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
- sizeof((_f)->nodes.data[0]), \
-					       found_btree_node_range_start_cmp, &_search); \
- _idx < (_f)->nodes.nr && \
- (_f)->nodes.data[_idx].btree_id == _search.btree_id && \
- (_f)->nodes.data[_idx].level == _search.level && \
- bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \
- _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
-
-bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
-{
- struct find_btree_nodes *f = &c->found_btree_nodes;
-
- struct found_btree_node search = {
- .btree_id = b->c.btree_id,
- .level = b->c.level,
- .min_key = b->data->min_key,
- .max_key = b->key.k.p,
- };
-
- for_each_found_btree_node_in_range(f, search, idx)
- if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
- return true;
- return false;
-}
-
-int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
-{
- int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
- if (ret)
- return ret;
-
- struct found_btree_node search = {
- .btree_id = btree,
- .level = 0,
- .min_key = POS_MIN,
- .max_key = SPOS_MAX,
- };
-
- for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
- return true;
- return false;
-}
-
-int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
- unsigned level, struct bpos node_min, struct bpos node_max)
-{
- if (btree_id_is_alloc(btree))
- return 0;
-
- struct find_btree_nodes *f = &c->found_btree_nodes;
-
- int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
- if (ret)
- return ret;
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "recovery ");
- bch2_btree_id_level_to_text(&buf, btree, level);
- prt_str(&buf, " ");
- bch2_bpos_to_text(&buf, node_min);
- prt_str(&buf, " - ");
- bch2_bpos_to_text(&buf, node_max);
-
- bch_info(c, "%s(): %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- struct found_btree_node search = {
- .btree_id = btree,
- .level = level,
- .min_key = node_min,
- .max_key = node_max,
- };
-
- for_each_found_btree_node_in_range(f, search, idx) {
- struct found_btree_node n = f->nodes.data[idx];
-
- n.range_updated |= bpos_lt(n.min_key, node_min);
- n.min_key = bpos_max(n.min_key, node_min);
-
- n.range_updated |= bpos_gt(n.max_key, node_max);
- n.max_key = bpos_min(n.max_key, node_max);
-
- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
- found_btree_node_to_key(&tmp.k, &n);
-
- if (c->opts.verbose) {
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
- bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
- printbuf_exit(&buf);
- }
-
- BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
- (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = level + 1,
- .btree = btree,
- }));
-
- ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
-{
- darray_exit(&f->nodes);
-}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
deleted file mode 100644
index 66e6f9ed19d0..000000000000
--- a/fs/bcachefs/btree_node_scan.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
-#define _BCACHEFS_BTREE_NODE_SCAN_H
-
-int bch2_scan_for_btree_nodes(struct bch_fs *);
-bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
-int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
-int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
-void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
-
-#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
deleted file mode 100644
index 2811b6857c97..000000000000
--- a/fs/bcachefs/btree_node_scan_types.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
-#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
-
-#include "darray.h"
-
-struct found_btree_node {
- bool range_updated:1;
- u8 btree_id;
- u8 level;
- unsigned sectors_written;
- u32 seq;
- u64 journal_seq;
- u64 cookie;
-
- struct bpos min_key;
- struct bpos max_key;
-
- unsigned nr_ptrs;
- struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
-};
-
-typedef DARRAY(struct found_btree_node) found_btree_nodes;
-
-struct find_btree_nodes {
- int ret;
- struct mutex lock;
- found_btree_nodes nodes;
-};
-
-#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
deleted file mode 100644
index 639ef75b3dbd..000000000000
--- a/fs/bcachefs/btree_trans_commit.c
+++ /dev/null
@@ -1,1121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "disk_accounting.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "replicas.h"
-#include "snapshot.h"
-
-#include <linux/prefetch.h>
-#include <linux/string_helpers.h>
-
-static const char * const trans_commit_flags_strs[] = {
-#define x(n, ...) #n,
- BCH_TRANS_COMMIT_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit_flags flags)
-{
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
- prt_printf(out, "watermark=%s", bch2_watermarks[watermark]);
-
- flags >>= BCH_WATERMARK_BITS;
- if (flags) {
- prt_char(out, ' ');
- bch2_prt_bitflags(out, trans_commit_flags_strs, flags);
- }
-}
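
/*
 * Editor's sketch, not part of this patch: the helper above prints flag names
 * out of a NULL-terminated table that the BCH_TRANS_COMMIT_FLAGS() x-macro
 * generates. The same generate-enum-and-string-table-from-one-list pattern,
 * with hypothetical flag names:
 */
#include <stdio.h>

#define DEMO_FLAGS()		\
	x(flag_one)		\
	x(flag_two)

enum demo_flag_bits {
#define x(n)	DEMO_FLAG_BIT_##n,
	DEMO_FLAGS()
#undef x
};

static const char * const demo_flag_strs[] = {
#define x(n)	#n,
	DEMO_FLAGS()
#undef x
	NULL
};

static void flags_to_text(unsigned flags)
{
	for (unsigned i = 0; demo_flag_strs[i]; i++)
		if (flags & (1U << i))
			printf(" %s", demo_flag_strs[i]);
	printf("\n");
}

int main(void)
{
	flags_to_text((1U << DEMO_FLAG_BIT_flag_one) |
		      (1U << DEMO_FLAG_BIT_flag_two));
	return 0;
}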
-
-static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bch_fs *c = trans->c;
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
-
- if (j_k)
- k = bkey_i_to_s_c(j_k);
- }
-
- u = *k.k;
- u.needs_whiteout = i->old_k.needs_whiteout;
-
- BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
- BUG_ON(i->old_v != k.v);
-#endif
-}
-
-static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- return (trans->paths + i->path)->l + i->level;
-}
-
-static inline bool same_leaf_as_prev(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- return i != trans->updates &&
- insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
-}
-
-static inline bool same_leaf_as_next(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- return i + 1 < trans->updates + trans->nr_updates &&
- insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
-}
-
-inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- if (unlikely(btree_node_just_written(b)) &&
- bch2_btree_post_write_cleanup(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-
- /*
- * If the last bset has been written, or if it's gotten too big - start
- * a new bset to insert into:
- */
- if (want_new_bset(c, b))
- bch2_btree_init_next(trans, b);
-}
-
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- while (--i >= trans->updates) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
- }
-
- trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int bch2_trans_lock_write(struct btree_trans *trans)
-{
- EBUG_ON(trans->write_locked);
-
- trans_for_each_update(trans, i) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
- return trans_lock_write_fail(trans, i);
-
- if (!i->cached)
- bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
- }
-
- trans->write_locked = true;
- return 0;
-}
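
/*
 * Editor's sketch, not part of this patch: bch2_trans_lock_write() above
 * write-locks each distinct leaf in update order and, via
 * trans_lock_write_fail(), unwinds every lock it already took before asking
 * for a restart. The same acquire-all-or-unwind shape with plain pthread
 * mutexes (build with -lpthread):
 */
#include <pthread.h>
#include <stdio.h>

#define NR_LEAVES 3

static pthread_mutex_t leaf_lock[NR_LEAVES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static int lock_all_leaves(void)
{
	for (int i = 0; i < NR_LEAVES; i++)
		if (pthread_mutex_trylock(&leaf_lock[i])) {
			while (--i >= 0)	/* unwind, like trans_lock_write_fail() */
				pthread_mutex_unlock(&leaf_lock[i]);
			return -1;		/* caller would restart the transaction */
		}
	return 0;
}

static void unlock_all_leaves(void)
{
	for (int i = 0; i < NR_LEAVES; i++)
		pthread_mutex_unlock(&leaf_lock[i]);
}

int main(void)
{
	if (!lock_all_leaves()) {
		printf("locked all leaves\n");
		unlock_all_leaves();
	}
	return 0;
}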
-
-static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans)
-{
- if (likely(trans->write_locked)) {
- trans_for_each_update(trans, i)
- if (btree_node_locked_type(trans->paths + i->path, i->level) ==
- BTREE_NODE_WRITE_LOCKED)
- bch2_btree_node_unlock_write_inlined(trans,
- trans->paths + i->path, insert_l(trans, i)->b);
- trans->write_locked = false;
- }
-}
-
-/* Inserting into a given leaf node (last stage of insert): */
-
-/* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_i *insert)
-{
- struct bkey_packed *k;
- unsigned clobber_u64s = 0, new_u64s = 0;
-
- EBUG_ON(btree_node_just_written(b));
- EBUG_ON(bset_written(b, btree_bset_last(b)));
- EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
- EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
- EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
- EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
- kmsan_check_memory(insert, bkey_bytes(&insert->k));
-
- k = bch2_btree_node_iter_peek_all(node_iter, b);
- if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
- k = NULL;
-
- /* @k is the key being overwritten/deleted, if any: */
- EBUG_ON(k && bkey_deleted(k));
-
- /* Deleting, but not found? nothing to do: */
- if (bkey_deleted(&insert->k) && !k)
- return false;
-
- if (bkey_deleted(&insert->k)) {
- /* Deleting: */
- btree_account_key_drop(b, k);
- k->type = KEY_TYPE_deleted;
-
- if (k->needs_whiteout)
- push_whiteout(b, insert->k.p);
- k->needs_whiteout = false;
-
- if (k >= btree_bset_last(b)->start) {
- clobber_u64s = k->u64s;
- bch2_bset_delete(b, k, clobber_u64s);
- goto fix_iter;
- } else {
- bch2_btree_path_fix_key_modified(trans, b, k);
- }
-
- return true;
- }
-
- if (k) {
- /* Overwriting: */
- btree_account_key_drop(b, k);
- k->type = KEY_TYPE_deleted;
-
- insert->k.needs_whiteout = k->needs_whiteout;
- k->needs_whiteout = false;
-
- if (k >= btree_bset_last(b)->start) {
- clobber_u64s = k->u64s;
- goto overwrite;
- } else {
- bch2_btree_path_fix_key_modified(trans, b, k);
- }
- }
-
- k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
-overwrite:
- bch2_bset_insert(b, k, insert, clobber_u64s);
- new_u64s = k->u64s;
-fix_iter:
- if (clobber_u64s != new_u64s)
- bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
- clobber_u64s, new_u64s);
- return true;
-}
-
-static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
- unsigned i, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct btree_write *w = container_of(pin, struct btree_write, journal);
- struct btree *b = container_of(w, struct btree, writes[i]);
- struct btree_trans *trans = bch2_trans_get(c);
- unsigned long old, new;
- unsigned idx = w - b->writes;
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- if (!(old & (1 << BTREE_NODE_dirty)) ||
- !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
- w->journal.seq != seq)
- break;
-
- new &= ~BTREE_WRITE_TYPE_MASK;
- new |= BTREE_WRITE_journal_reclaim;
- new |= 1 << BTREE_NODE_need_write;
- } while (!try_cmpxchg(&b->flags, &old, new));
-
- btree_node_write_if_need(trans, b, SIX_LOCK_read);
- six_unlock_read(&b->c.lock);
-
- bch2_trans_put(trans);
- return 0;
-}
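
/*
 * Editor's sketch, not part of this patch: __btree_node_flush() above updates
 * the node's flags word with a read-modify-write retry loop built on
 * try_cmpxchg(). The same lock-free pattern with C11 atomics and a
 * hypothetical flag bit:
 */
#include <stdatomic.h>
#include <stdio.h>

#define NEED_WRITE	(1UL << 0)

static _Atomic unsigned long node_flags;

static void set_need_write(void)
{
	unsigned long old = atomic_load(&node_flags), new;

	do {
		new = old | NEED_WRITE;
	} while (!atomic_compare_exchange_weak(&node_flags, &old, new));
}

int main(void)
{
	set_need_write();
	printf("flags = %lx\n", (unsigned long) atomic_load(&node_flags));
	return 0;
}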
-
-int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
- return __btree_node_flush(j, pin, 0, seq);
-}
-
-int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
- return __btree_node_flush(j, pin, 1, seq);
-}
-
-inline void bch2_btree_add_journal_pin(struct bch_fs *c,
- struct btree *b, u64 seq)
-{
- struct btree_write *w = btree_current_write(b);
-
- bch2_journal_pin_add(&c->journal, seq, &w->journal,
- btree_node_write_idx(b) == 0
- ? bch2_btree_node_flush0
- : bch2_btree_node_flush1);
-}
-
-/**
- * bch2_btree_insert_key_leaf() - insert a key one key into a leaf node
- * @trans: btree transaction object
- * @path: path pointing to @insert's pos
- * @insert: key to insert
- * @journal_seq: sequence number of journal reservation
- */
-inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
- struct btree_path *path,
- struct bkey_i *insert,
- u64 journal_seq)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = path_l(path)->b;
- struct bset_tree *t = bset_tree_last(b);
- struct bset *i = bset(b, t);
- int old_u64s = bset_u64s(t);
- int old_live_u64s = b->nr.live_u64s;
- int live_u64s_added, u64s_added;
-
- if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
- &path_l(path)->iter, insert)))
- return;
-
- i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
-
- bch2_btree_add_journal_pin(c, b, journal_seq);
-
- if (unlikely(!btree_node_dirty(b))) {
- EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- set_btree_node_dirty_acct(c, b);
- }
-
- live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) bset_u64s(t) - old_u64s;
-
- if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
- if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
- if (u64s_added > live_u64s_added &&
- bch2_maybe_compact_whiteouts(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-}
-
-/* Cached btree updates: */
-
-/* Normal update interface: */
-
-static inline void btree_insert_entry_checks(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- struct btree_path *path = trans->paths + i->path;
-
- BUG_ON(!bpos_eq(i->k->k.p, path->pos));
- BUG_ON(i->cached != path->cached);
- BUG_ON(i->level != path->level);
- BUG_ON(i->btree_id != path->btree_id);
- BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id));
- EBUG_ON(!i->level &&
- btree_type_has_snapshots(i->btree_id) &&
- !(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
- test_bit(JOURNAL_replay_done, &trans->c->journal.flags) &&
- i->k->k.p.snapshot &&
- bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
-}
-
-static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
- unsigned flags)
-{
- return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
- trans->journal_u64s, flags, trans);
-}
-
-#define JSET_ENTRY_LOG_U64s 4
-
-static noinline void journal_transaction_name(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct jset_entry *entry =
- bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_log, 0, 0,
- JSET_ENTRY_LOG_U64s);
- struct jset_entry_log *l =
- container_of(entry, struct jset_entry_log, entry);
-
- memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64),
- trans->fn, strlen(trans->fn), 0);
-}
-
-static inline int btree_key_can_insert(struct btree_trans *trans,
- struct btree *b, unsigned u64s)
-{
- if (!bch2_btree_node_insert_fits(b, u64s))
- return bch_err_throw(trans->c, btree_insert_btree_node_full);
-
- return 0;
-}
-
-static noinline int
-btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
- struct btree_path *path, unsigned new_u64s)
-{
- struct bkey_cached *ck = (void *) path->l[0].b;
- struct bkey_i *new_k;
- int ret;
-
- bch2_trans_unlock_updates_write(trans);
- bch2_trans_unlock(trans);
-
- new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
- if (!new_k) {
- struct bch_fs *c = trans->c;
- bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_id_str(path->btree_id), new_u64s);
- return bch_err_throw(c, ENOMEM_btree_key_cache_insert);
- }
-
- ret = bch2_trans_relock(trans) ?:
- bch2_trans_lock_write(trans);
- if (unlikely(ret)) {
- kfree(new_k);
- return ret;
- }
-
- memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
-
- trans_for_each_update(trans, i)
- if (i->old_v == &ck->k->v)
- i->old_v = &new_k->v;
-
- kfree(ck->k);
- ck->u64s = new_u64s;
- ck->k = new_k;
- return 0;
-}
-
-static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
- struct btree_path *path, unsigned u64s)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) path->l[0].b;
- unsigned new_u64s;
- struct bkey_i *new_k;
- unsigned watermark = flags & BCH_WATERMARK_MASK;
-
- EBUG_ON(path->level);
-
- if (watermark < BCH_WATERMARK_reclaim &&
- !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(c))
- return bch_err_throw(c, btree_insert_need_journal_reclaim);
-
- /*
- * bch2_varint_decode can read past the end of the buffer by at most 7
- * bytes (it won't be used):
- */
- u64s += 1;
-
- if (u64s <= ck->u64s)
- return 0;
-
- new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_k))
- return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
-
- trans_for_each_update(trans, i)
- if (i->old_v == &ck->k->v)
- i->old_v = &new_k->v;
-
- ck->u64s = new_u64s;
- ck->k = new_k;
- return 0;
-}
-
-/* Triggers: */
-
-static int run_one_mem_trigger(struct btree_trans *trans,
- struct btree_insert_entry *i,
- unsigned flags)
-{
- verify_update_old_key(trans, i);
-
- if (unlikely(flags & BTREE_TRIGGER_norun))
- return 0;
-
- struct bkey_s_c old = { &i->old_k, i->old_v };
- struct bkey_i *new = i->k;
- const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
- const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
-
- if (old_ops->trigger == new_ops->trigger)
- return bch2_key_trigger(trans, i->btree_id, i->level,
- old, bkey_i_to_s(new),
- BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
- else
- return bch2_key_trigger_new(trans, i->btree_id, i->level,
- bkey_i_to_s(new), flags) ?:
- bch2_key_trigger_old(trans, i->btree_id, i->level,
- old, flags);
-}
-
-static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- verify_update_old_key(trans, i);
-
- if ((i->flags & BTREE_TRIGGER_norun) ||
- !btree_node_type_has_trans_triggers(i->bkey_type))
- return 0;
-
- /*
- * Transactional triggers create new btree_insert_entries, so we can't
- * pass them a pointer to a btree_insert_entry, that memory is going to
- * move:
- */
- struct bkey old_k = i->old_k;
- struct bkey_s_c old = { &old_k, i->old_v };
- const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
- const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
- unsigned flags = i->flags|BTREE_TRIGGER_transactional;
-
- if (!i->insert_trigger_run &&
- !i->overwrite_trigger_run &&
- old_ops->trigger == new_ops->trigger) {
- i->overwrite_trigger_run = true;
- i->insert_trigger_run = true;
- return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
- BTREE_TRIGGER_insert|
- BTREE_TRIGGER_overwrite|flags) ?: 1;
- } else if (!i->overwrite_trigger_run) {
- i->overwrite_trigger_run = true;
- return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
- } else if (!i->insert_trigger_run) {
- i->insert_trigger_run = true;
- return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
- } else {
- return 0;
- }
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
- unsigned sort_id_start = 0;
-
- while (sort_id_start < trans->nr_updates) {
- unsigned i, sort_id = trans->updates[sort_id_start].sort_order;
- bool trans_trigger_run;
-
- /*
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being
- * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop
- * references before they are re-added.
- *
- * Running triggers will append more updates to the list of
- * updates as we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- for (i = sort_id_start;
- i < trans->nr_updates && trans->updates[i].sort_order <= sort_id;
- i++) {
- if (trans->updates[i].sort_order < sort_id) {
- sort_id_start = i;
- continue;
- }
-
- int ret = run_one_trans_trigger(trans, trans->updates + i);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
- }
- } while (trans_trigger_run);
-
- sort_id_start = i;
- }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
- btree_node_type_has_trans_triggers(i->bkey_type) &&
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
-#endif
- return 0;
-}
-
-static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
-{
- trans_for_each_update(trans, i)
- if (btree_node_type_has_triggers(i->bkey_type) &&
- gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) {
- int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static inline int
-bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry **stopped_at,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_trans_commit_hook *h;
- unsigned u64s = 0;
- int ret = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-#if 0
- /* todo: bring back dynamic fault injection */
- if (race_fault()) {
- trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
- }
-#endif
- /*
- * Check if the insert will fit in the leaf node with the write lock
- * held, otherwise another thread could write the node changing the
- * amount of space available:
- */
-
- prefetch(&trans->c->journal.flags);
-
- trans_for_each_update(trans, i) {
- /* Multiple inserts might go to same leaf: */
- if (!same_leaf_as_prev(trans, i))
- u64s = 0;
-
- u64s += i->k->k.u64s;
- ret = !i->cached
- ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
- : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
- if (ret) {
- *stopped_at = i;
- return ret;
- }
-
- i->k->k.needs_whiteout = false;
- }
-
- /*
- * Don't get journal reservation until after we know insert will
- * succeed:
- */
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
- ret = bch2_trans_journal_res_get(trans,
- (flags & BCH_WATERMARK_MASK)|
- JOURNAL_RES_GET_NONBLOCK);
- if (ret)
- return ret;
-
- if (unlikely(trans->journal_transaction_names))
- journal_transaction_name(trans);
- }
-
- /*
- * Not allowed to fail after we've gotten our journal reservation - we
- * have to use it:
- */
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
- if (static_branch_unlikely(&bch2_journal_seq_verify))
- trans_for_each_update(trans, i)
- i->k->k.bversion.lo = trans->journal_res.seq;
- else if (static_branch_unlikely(&bch2_inject_invalid_keys))
- trans_for_each_update(trans, i)
- i->k->k.bversion = MAX_VERSION;
- }
-
- h = trans->hooks;
- while (h) {
- ret = h->fn(trans, h);
- if (ret)
- return ret;
- h = h->next;
- }
-
- struct bkey_i *accounting;
-
- percpu_down_read(&c->mark_lock);
- for (accounting = btree_trans_subbuf_base(trans, &trans->accounting);
- accounting != btree_trans_subbuf_top(trans, &trans->accounting);
- accounting = bkey_next(accounting)) {
- ret = bch2_accounting_trans_commit_hook(trans,
- bkey_i_to_accounting(accounting), flags);
- if (ret)
- goto revert_fs_usage;
- }
- percpu_up_read(&c->mark_lock);
-
- /* XXX: we only want to run this if deltas are nonzero */
- bch2_trans_account_disk_usage_change(trans);
-
- trans_for_each_update(trans, i)
- if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
- if (ret)
- goto fatal_err;
- }
-
- if (unlikely(c->gc_pos.phase)) {
- ret = bch2_trans_commit_run_gc_triggers(trans);
- if (ret)
- goto fatal_err;
- }
-
- struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit };
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
- for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
- i != btree_trans_journal_entries_top(trans);
- i = vstruct_next(i)) {
- ret = bch2_journal_entry_validate(c, NULL, i,
- bcachefs_metadata_version_current,
- CPU_BIG_ENDIAN, validate_context);
- if (unlikely(ret)) {
- bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
- trans->fn);
- goto fatal_err;
- }
- }
-
- trans_for_each_update(trans, i) {
- validate_context.level = i->level;
- validate_context.btree = i->btree_id;
-
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context);
- if (unlikely(ret)) {
- bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
- trans->fn, (void *) i->ip_allocated);
- goto fatal_err;
- }
- btree_insert_entry_checks(trans, i);
- }
-
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
- struct journal *j = &c->journal;
- struct jset_entry *entry;
-
- trans_for_each_update(trans, i) {
- if (i->key_cache_already_flushed)
- continue;
-
- if (i->flags & BTREE_UPDATE_nojournal)
- continue;
-
- verify_update_old_key(trans, i);
-
- if (trans->journal_transaction_names) {
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_overwrite,
- i->btree_id, i->level,
- i->old_k.u64s);
- bkey_reassemble((struct bkey_i *) entry->start,
- (struct bkey_s_c) { &i->old_k, i->old_v });
- }
-
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_btree_keys,
- i->btree_id, i->level,
- i->k->k.u64s);
- bkey_copy((struct bkey_i *) entry->start, i->k);
- }
-
- memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- btree_trans_journal_entries_start(trans),
- trans->journal_entries.u64s);
-
- EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s);
-
- trans->journal_res.offset += trans->journal_entries.u64s;
- trans->journal_res.u64s -= trans->journal_entries.u64s;
-
- memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_write_buffer_keys,
- BTREE_ID_accounting, 0,
- trans->accounting.u64s)->_data,
- btree_trans_subbuf_base(trans, &trans->accounting),
- trans->accounting.u64s);
-
- if (trans->journal_seq)
- *trans->journal_seq = trans->journal_res.seq;
- }
-
- trans_for_each_update(trans, i) {
- struct btree_path *path = trans->paths + i->path;
-
- if (!i->cached)
- bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
- else if (!i->key_cache_already_flushed)
- bch2_btree_insert_key_cached(trans, flags, i);
- else
- bch2_btree_key_cache_drop(trans, path);
- }
-
- return 0;
-fatal_err:
- bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
- percpu_down_read(&c->mark_lock);
-revert_fs_usage:
- for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
- i != accounting;
- i = bkey_next(i))
- bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags);
- percpu_up_read(&c->mark_lock);
- return ret;
-}
-
-static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
-{
- /*
- * Accounting keys aren't deduped in the journal: we have to compare
- * each individual update against what's in the btree to see if it has
- * been applied yet; and accounting updates also don't overwrite -
- * they're deltas that accumulate.
- */
- trans_for_each_update(trans, i)
- if (i->k->k.type != KEY_TYPE_accounting)
- bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-}
-
-static int bch2_trans_commit_journal_pin_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- return 0;
-}
-
-/*
- * Get journal reservation, take write locks, and attempt to do btree update(s):
- */
-static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry **stopped_at,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- int ret = 0, u64s_delta = 0;
-
- for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
- struct btree_insert_entry *i = trans->updates + idx;
- if (i->cached)
- continue;
-
- u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
- u64s_delta -= i->old_btree_u64s;
-
- if (!same_leaf_as_next(trans, i)) {
- if (u64s_delta <= 0) {
- ret = bch2_foreground_maybe_merge(trans, i->path,
- i->level, flags);
- if (unlikely(ret))
- return ret;
- }
-
- u64s_delta = 0;
- }
- }
-
- ret = bch2_trans_lock_write(trans);
- if (unlikely(ret))
- return ret;
-
- ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
-
- if (!ret && unlikely(trans->journal_replay_not_finished))
- bch2_drop_overwrites_from_journal(trans);
-
- bch2_trans_unlock_updates_write(trans);
-
- if (!ret && trans->journal_pin)
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- trans->journal_pin,
- bch2_trans_commit_journal_pin_flush);
-
- /*
- * Drop journal reservation after dropping write locks, since dropping
- * the journal reservation may kick off a journal write:
- */
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
- bch2_journal_res_put(&c->journal, &trans->journal_res);
-
- return ret;
-}
-
-static int journal_reclaim_wait_done(struct bch_fs *c)
-{
- int ret = bch2_journal_error(&c->journal) ?:
- bch2_btree_key_cache_wait_done(c);
-
- if (!ret)
- journal_reclaim_kick(&c->journal);
- return ret;
-}
-
-static noinline
-int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry *i,
- int ret, unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-
- if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) {
- /*
- * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
- * flag
- */
- if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < BCH_WATERMARK_reclaim) {
- ret = bch_err_throw(c, journal_reclaim_would_deadlock);
- goto out;
- }
-
- ret = drop_locks_do(trans,
- bch2_trans_journal_res_get(trans,
- (flags & BCH_WATERMARK_MASK)|
- JOURNAL_RES_GET_CHECK));
- goto out;
- }
-
- switch (ret) {
- case -BCH_ERR_btree_insert_btree_node_full:
- ret = bch2_btree_split_leaf(trans, i->path, flags);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_and_count(c, trans_restart_btree_node_split, trans,
- trace_ip, trans->paths + i->path);
- break;
- case -BCH_ERR_btree_insert_need_mark_replicas:
- ret = drop_locks_do(trans,
- bch2_accounting_update_sb(trans));
- break;
- case -BCH_ERR_btree_insert_need_journal_reclaim:
- bch2_trans_unlock(trans);
-
- trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
- track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true);
-
- wait_event_freezable(c->journal.reclaim_wait,
- (ret = journal_reclaim_wait_done(c)));
-
- track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false);
-
- if (ret < 0)
- break;
-
- ret = bch2_trans_relock(trans);
- break;
- default:
- BUG_ON(ret >= 0);
- break;
- }
-out:
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
-
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
- (flags & BCH_TRANS_COMMIT_no_enospc), c,
- "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
-
- return ret;
-}
-
-/*
- * This is for updates done in the early part of fsck - btree_gc - before we've
- * gone RW. We only add the new key to the list of keys for journal replay to
- * do.
- */
-static noinline int
-do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
-
- BUG_ON(current != c->recovery_task);
-
- trans_for_each_update(trans, i) {
- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
- if (ret)
- return ret;
- }
-
- for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
- i != btree_trans_journal_entries_top(trans);
- i = vstruct_next(i)) {
- if (i->type == BCH_JSET_ENTRY_btree_keys ||
- i->type == BCH_JSET_ENTRY_write_buffer_keys) {
- jset_entry_for_each_key(i, k) {
- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k);
- if (ret)
- return ret;
- }
- }
-
- if (i->type == BCH_JSET_ENTRY_btree_root) {
- guard(mutex)(&c->btree_root_lock);
-
- struct btree_root *r = bch2_btree_id_root(c, i->btree_id);
-
- bkey_copy(&r->key, i->start);
- r->level = i->level;
- r->alive = true;
- }
- }
-
- for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
- i != btree_trans_subbuf_top(trans, &trans->accounting);
- i = bkey_next(i)) {
- int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
-{
- struct btree_insert_entry *errored_at = NULL;
- struct bch_fs *c = trans->c;
- unsigned journal_u64s = 0;
- int ret = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- ret = trans_maybe_inject_restart(trans, _RET_IP_);
- if (unlikely(ret))
- goto out_reset;
-
- if (!trans->nr_updates &&
- !trans->journal_entries.u64s &&
- !trans->accounting.u64s)
- goto out_reset;
-
- ret = bch2_trans_commit_run_triggers(trans);
- if (ret)
- goto out_reset;
-
- if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
- unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) {
- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
- ret = do_bch2_trans_commit_to_journal_replay(trans);
- else
- ret = bch_err_throw(c, erofs_trans_commit);
- goto out_reset;
- }
-
- EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
-
- journal_u64s = jset_u64s(trans->accounting.u64s);
- trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
- if (trans->journal_transaction_names)
- journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
-
- trans_for_each_update(trans, i) {
- struct btree_path *path = trans->paths + i->path;
-
- EBUG_ON(!path->should_be_locked);
-
- ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
- if (unlikely(ret))
- goto out;
-
- EBUG_ON(!btree_node_intent_locked(path, i->level));
-
- if (i->key_cache_already_flushed)
- continue;
-
- if (i->flags & BTREE_UPDATE_nojournal)
- continue;
-
- /* we're going to journal the key being updated: */
- journal_u64s += jset_u64s(i->k->k.u64s);
-
- /* and we're also going to log the overwrite: */
- if (trans->journal_transaction_names)
- journal_u64s += jset_u64s(i->old_k.u64s);
- }
-
- if (trans->extra_disk_res) {
- ret = bch2_disk_reservation_add(c, trans->disk_res,
- trans->extra_disk_res,
- (flags & BCH_TRANS_COMMIT_no_enospc)
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- goto err;
- }
-retry:
- errored_at = NULL;
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
-
- trans->journal_u64s = journal_u64s + trans->journal_entries.u64s;
-
- ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
-
- /* make sure we didn't drop or screw up locks: */
- bch2_trans_verify_locks(trans);
-
- if (ret)
- goto err;
-
- trace_and_count(c, transaction_commit, trans, _RET_IP_);
-out:
- if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans);
-out_reset:
- if (!ret)
- bch2_trans_downgrade(trans);
- bch2_trans_reset_updates(trans);
-
- return ret;
-err:
- ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
- if (ret)
- goto out;
-
- /*
- * We might have done another transaction commit in the error path -
- * i.e. btree write buffer flush - which will have made use of
- * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
- * how the journal sequence number to pin is passed in - so we must
- * restart:
- */
- if (flags & BCH_TRANS_COMMIT_no_journal_res) {
- ret = bch_err_throw(c, transaction_restart_nested);
- goto out;
- }
-
- goto retry;
-}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
deleted file mode 100644
index 112170fd9c8f..000000000000
--- a/fs/bcachefs/btree_types.h
+++ /dev/null
@@ -1,937 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_TYPES_H
-#define _BCACHEFS_BTREE_TYPES_H
-
-#include <linux/list.h>
-#include <linux/rhashtable.h>
-
-#include "bbpos_types.h"
-#include "btree_key_cache_types.h"
-#include "buckets_types.h"
-#include "darray.h"
-#include "errcode.h"
-#include "journal_types.h"
-#include "replicas_types.h"
-#include "six.h"
-
-struct open_bucket;
-struct btree_update;
-struct btree_trans;
-
-#define MAX_BSETS 3U
-
-struct btree_nr_keys {
-
- /*
- * Amount of live metadata (i.e. size of node after a compaction) in
- * units of u64s
- */
- u16 live_u64s;
- u16 bset_u64s[MAX_BSETS];
-
- /* live keys only: */
- u16 packed_keys;
- u16 unpacked_keys;
-};
-
-struct bset_tree {
- /*
- * We construct a binary tree in an array as if the array
- * started at 1, so that things line up on the same cachelines
- * better: see comments in bset.c at cacheline_to_bkey() for
- * details
- */
-
- /* size of the binary tree and prev array */
- u16 size;
-
- /* function of size - precalculated for to_inorder() */
- u16 extra;
-
- u16 data_offset;
- u16 aux_data_offset;
- u16 end_offset;
-};
-
-struct btree_write {
- struct journal_entry_pin journal;
-};
-
-struct btree_alloc {
- struct open_buckets ob;
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
-};
-
-struct btree_bkey_cached_common {
- struct six_lock lock;
- u8 level;
- u8 btree_id;
- bool cached;
-};
-
-struct btree {
- struct btree_bkey_cached_common c;
-
- struct rhash_head hash;
- u64 hash_val;
-
- unsigned long flags;
- u16 written;
- u8 nsets;
- u8 nr_key_bits;
- u16 version_ondisk;
-
- struct bkey_format format;
-
- struct btree_node *data;
- void *aux_data;
-
- /*
- * Sets of sorted keys - the real btree node - plus a binary search tree
- *
- * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
- * to the memory we have allocated for this btree node. Additionally,
- * set[0]->data points to the entire btree node as it exists on disk.
- */
- struct bset_tree set[MAX_BSETS];
-
- struct btree_nr_keys nr;
- u16 sib_u64s[2];
- u16 whiteout_u64s;
- u8 byte_order;
- u8 unpack_fn_len;
-
- struct btree_write writes[2];
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-
- /*
- * XXX: add a delete sequence number, so when bch2_btree_node_relock()
- * fails because the lock sequence number has changed - i.e. the
- * contents were modified - we can still relock the node if it's still
- * the one we want, without redoing the traversal
- */
-
- /*
- * For asynchronous splits/interior node updates:
- * When we do a split, we allocate new child nodes and update the parent
- * node to point to them: we update the parent in memory immediately,
- * but then we must wait until the children have been written out before
- * the update to the parent can be written - this is a list of the
- * btree_updates that are blocking this node from being
- * written:
- */
- struct list_head write_blocked;
-
- /*
- * Also for asynchronous splits/interior node updates:
- * If a btree node isn't reachable yet, we don't want to kick off
- * another write - because that write also won't yet be reachable and
- * marking it as completed before it's reachable would be incorrect:
- */
- unsigned long will_make_reachable;
-
- struct open_buckets ob;
-
- /* lru list */
- struct list_head list;
-};
-
-#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
- x(cache_reserve) \
- x(lock_intent) \
- x(lock_write) \
- x(dirty) \
- x(read_in_flight) \
- x(write_in_flight) \
- x(noevict) \
- x(write_blocked) \
- x(will_make_reachable) \
- x(access_bit)
-
-enum bch_btree_cache_not_freed_reasons {
-#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
- BCH_BTREE_CACHE_NOT_FREED_REASONS()
-#undef x
- BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
-};
-
-struct btree_cache_list {
- unsigned idx;
- struct shrinker *shrink;
- struct list_head list;
- size_t nr;
-};
-
-struct btree_cache {
- struct rhashtable table;
- bool table_init_done;
- /*
- * We never free a struct btree, except on shutdown - we just put it on
- * the btree_cache_freed list and reuse it later. This simplifies the
- * code, and it doesn't cost us much memory as the memory usage is
- * dominated by buffers that hold the actual btree node data and those
- * can be freed - and the number of struct btrees allocated is
- * effectively bounded.
- *
- * btree_cache_freeable effectively is a small cache - we use it because
- * high order page allocations can be rather expensive, and it's quite
- * common to delete and allocate btree nodes in quick succession. It
- * should never grow past ~2-3 nodes in practice.
- */
- struct mutex lock;
- struct list_head freeable;
- struct list_head freed_pcpu;
- struct list_head freed_nonpcpu;
- struct btree_cache_list live[2];
-
- size_t nr_freeable;
- size_t nr_reserve;
- size_t nr_by_btree[BTREE_ID_NR];
- atomic_long_t nr_dirty;
-
- /* shrinker stats */
- size_t nr_freed;
- u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
-
- /*
- * If we need to allocate memory for a new btree node and that
- * allocation fails, we can cannibalize another node in the btree cache
- * to satisfy the allocation - lock to guarantee only one thread does
- * this at a time:
- */
- struct task_struct *alloc_lock;
- struct closure_waitlist alloc_wait;
-
- struct bbpos pinned_nodes_start;
- struct bbpos pinned_nodes_end;
- /* btree id mask: 0 for leaves, 1 for interior */
- u64 pinned_nodes_mask[2];
-};
-
-struct btree_node_iter {
- struct btree_node_iter_set {
- u16 k, end;
- } data[MAX_BSETS];
-};
-
-#define BTREE_ITER_FLAGS() \
- x(slots) \
- x(intent) \
- x(prefetch) \
- x(is_extents) \
- x(not_extents) \
- x(cached) \
- x(with_key_cache) \
- x(with_updates) \
- x(with_journal) \
- x(snapshot_field) \
- x(all_snapshots) \
- x(filter_snapshots) \
- x(nopreserve) \
- x(cached_nofill) \
- x(key_cache_fill) \
-
-#define STR_HASH_FLAGS() \
- x(must_create) \
- x(must_replace)
-
-#define BTREE_UPDATE_FLAGS() \
- x(internal_snapshot_node) \
- x(nojournal) \
- x(key_cache_reclaim)
-
-
-/*
- * BTREE_TRIGGER_norun - don't run triggers at all
- *
- * BTREE_TRIGGER_transactional - we're running transactional triggers as part of
- * a transaction commit: triggers may generate new updates
- *
- * BTREE_TRIGGER_atomic - we're running atomic triggers during a transaction
- * commit: we have our journal reservation, we're holding btree node write
- * locks, and we know the transaction is going to commit (returning an error
- * here is a fatal error, causing us to go emergency read-only)
- *
- * BTREE_TRIGGER_gc - we're in gc/fsck: running triggers to recalculate e.g. disk usage
- *
- * BTREE_TRIGGER_insert - @new is entering the btree
- * BTREE_TRIGGER_overwrite - @old is leaving the btree
- */
-#define BTREE_TRIGGER_FLAGS() \
- x(norun) \
- x(transactional) \
- x(atomic) \
- x(check_repair) \
- x(gc) \
- x(insert) \
- x(overwrite) \
- x(is_root)
-
-enum {
-#define x(n) BTREE_ITER_FLAG_BIT_##n,
- BTREE_ITER_FLAGS()
- STR_HASH_FLAGS()
- BTREE_UPDATE_FLAGS()
- BTREE_TRIGGER_FLAGS()
-#undef x
-};
-
-/* iter flags must fit in a u16: */
-//BUILD_BUG_ON(BTREE_ITER_FLAG_BIT_key_cache_fill > 15);
-
-enum btree_iter_update_trigger_flags {
-#define x(n) BTREE_ITER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- BTREE_ITER_FLAGS()
-#undef x
-#define x(n) STR_HASH_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- STR_HASH_FLAGS()
-#undef x
-#define x(n) BTREE_UPDATE_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- BTREE_UPDATE_FLAGS()
-#undef x
-#define x(n) BTREE_TRIGGER_##n = 1U << BTREE_ITER_FLAG_BIT_##n,
- BTREE_TRIGGER_FLAGS()
-#undef x
-};
-
-enum btree_path_uptodate {
- BTREE_ITER_UPTODATE = 0,
- BTREE_ITER_NEED_RELOCK = 1,
- BTREE_ITER_NEED_TRAVERSE = 2,
-};
-
-#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
-#define TRACK_PATH_ALLOCATED
-#endif
-
-typedef u16 btree_path_idx_t;
-
-struct btree_path {
- btree_path_idx_t sorted_idx;
- u8 ref;
- u8 intent_ref;
-
- /* btree_iter_copy starts here: */
- struct bpos pos;
-
- enum btree_id btree_id:5;
- bool cached:1;
- bool preserve:1;
- enum btree_path_uptodate uptodate:2;
- /*
- * When true, failing to relock this path will cause the transaction to
- * restart:
- */
- bool should_be_locked:1;
- unsigned level:3,
- locks_want:3;
- u8 nodes_locked;
-
- struct btree_path_level {
- struct btree *b;
- struct btree_node_iter iter;
- u32 lock_seq;
-#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- u64 lock_taken_time;
-#endif
- } l[BTREE_MAX_DEPTH];
-#ifdef TRACK_PATH_ALLOCATED
- unsigned long ip_allocated;
-#endif
-};
-
-static inline struct btree_path_level *path_l(struct btree_path *path)
-{
- return path->l + path->level;
-}
-
-static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
-{
-#ifdef TRACK_PATH_ALLOCATED
- return path->ip_allocated;
-#else
- return _THIS_IP_;
-#endif
-}
-
-/*
- * @pos - iterator's current position
- * @level - current btree depth
- * @locks_want - btree level below which we start taking intent locks
- * @nodes_locked - bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked - bitmask indicating which locks are intent locks
- */
-struct btree_iter {
- btree_path_idx_t path;
- btree_path_idx_t update_path;
- btree_path_idx_t key_cache_path;
-
- enum btree_id btree_id:8;
- u8 min_depth;
-
- /* btree_iter_copy starts here: */
- u16 flags;
-
- /* When we're filtering by snapshot, the snapshot ID we're looking for: */
- unsigned snapshot;
-
- struct bpos pos;
- /*
- * Current unpacked key - so that bch2_btree_iter_next()/
- * bch2_btree_iter_next_slot() can correctly advance pos.
- */
- struct bkey k;
-
- /* BTREE_ITER_with_journal: */
- size_t journal_idx;
-#ifdef TRACK_PATH_ALLOCATED
- unsigned long ip_allocated;
-#endif
-};
-
-#define BKEY_CACHED_ACCESSED 0
-#define BKEY_CACHED_DIRTY 1
-
-struct bkey_cached {
- struct btree_bkey_cached_common c;
-
- unsigned long flags;
- u16 u64s;
- struct bkey_cached_key key;
-
- struct rhash_head hash;
-
- struct journal_entry_pin journal;
- u64 seq;
-
- struct bkey_i *k;
- struct rcu_head rcu;
-};
-
-static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
-{
- return !b->cached
- ? container_of(b, struct btree, c)->key.k.p
- : container_of(b, struct bkey_cached, c)->key.pos;
-}
-
-struct btree_insert_entry {
- unsigned flags;
- u8 sort_order;
- u8 bkey_type;
- enum btree_id btree_id:8;
- u8 level:4;
- bool cached:1;
- bool insert_trigger_run:1;
- bool overwrite_trigger_run:1;
- bool key_cache_already_flushed:1;
- /*
- * @old_k may be a key from the journal; @old_btree_u64s always refers
- * to the size of the key being overwritten in the btree:
- */
- u8 old_btree_u64s;
- btree_path_idx_t path;
- struct bkey_i *k;
- /* key being overwritten: */
- struct bkey old_k;
- const struct bch_val *old_v;
- unsigned long ip_allocated;
-};
-
-/* Number of btree paths we preallocate, usually enough */
-#define BTREE_ITER_INITIAL 64
-/*
- * Limit for btree_trans_too_many_iters(); this is enough that almost all code
- * paths should run inside this limit, and if they don't it usually indicates a
- * bug (leaking/duplicated btree paths).
- *
- * exception: some fsck paths
- *
- * bugs with excessive path usage seem to have possibly been eliminated now, so
- * we might consider eliminating this (and btree_trans_too_many_iter()) at some
- * point.
- */
-#define BTREE_ITER_NORMAL_LIMIT 256
-/* never exceed limit */
-#define BTREE_ITER_MAX (1U << 10)
-
-struct btree_trans_commit_hook;
-typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
-
-struct btree_trans_commit_hook {
- btree_trans_commit_hook_fn *fn;
- struct btree_trans_commit_hook *next;
-};
-
-#define BTREE_TRANS_MEM_MAX (1U << 16)
-
-#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
-
-struct btree_trans_paths {
- unsigned long nr_paths;
- struct btree_path paths[];
-};
-
-struct trans_kmalloc_trace {
- unsigned long ip;
- size_t bytes;
-};
-typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace;
-
-struct btree_trans_subbuf {
- u16 base;
- u16 u64s;
- u16 size;
-};
-
-struct btree_trans {
- struct bch_fs *c;
-
- unsigned long *paths_allocated;
- struct btree_path *paths;
- btree_path_idx_t *sorted;
- struct btree_insert_entry *updates;
-
- void *mem;
- unsigned mem_top;
- unsigned mem_bytes;
- unsigned realloc_bytes_required;
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- darray_trans_kmalloc_trace trans_kmalloc_trace;
-#endif
-
- btree_path_idx_t nr_sorted;
- btree_path_idx_t nr_paths;
- btree_path_idx_t nr_paths_max;
- btree_path_idx_t nr_updates;
- u8 fn_idx;
- u8 lock_must_abort;
- bool lock_may_not_fail:1;
- bool srcu_held:1;
- bool locked:1;
- bool pf_memalloc_nofs:1;
- bool write_locked:1;
- bool used_mempool:1;
- bool in_traverse_all:1;
- bool paths_sorted:1;
- bool memory_allocation_failure:1;
- bool journal_transaction_names:1;
- bool journal_replay_not_finished:1;
- bool notrace_relock_fail:1;
- enum bch_errcode restarted:16;
- u32 restart_count;
-#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
- u32 restart_count_this_trans;
-#endif
-
- u64 last_begin_time;
- unsigned long last_begin_ip;
- unsigned long last_restarted_ip;
-#ifdef CONFIG_BCACHEFS_DEBUG
- bch_stacktrace last_restarted_trace;
-#endif
- unsigned long last_unlock_ip;
- unsigned long srcu_lock_time;
-
- const char *fn;
- struct btree_bkey_cached_common *locking;
- struct six_lock_waiter locking_wait;
- int srcu_idx;
-
- /* update path: */
- struct btree_trans_subbuf journal_entries;
- struct btree_trans_subbuf accounting;
-
- struct btree_trans_commit_hook *hooks;
- struct journal_entry_pin *journal_pin;
-
- struct journal_res journal_res;
- u64 *journal_seq;
- struct disk_reservation *disk_res;
-
- struct bch_fs_usage_base fs_usage_delta;
-
- unsigned journal_u64s;
- unsigned extra_disk_res; /* XXX kill */
-
- __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
- /* Entries before this are zeroed out on every bch2_trans_get() call */
-
- struct list_head list;
- struct closure ref;
-
- unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)];
- struct btree_trans_paths trans_paths;
- struct btree_path _paths[BTREE_ITER_INITIAL];
- btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4];
- struct btree_insert_entry _updates[BTREE_ITER_INITIAL];
-};
-
-static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter)
-{
- return trans->paths + iter->path;
-}
-
-static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter)
-{
- return iter->key_cache_path
- ? trans->paths + iter->key_cache_path
- : NULL;
-}
-
-#define BCH_BTREE_WRITE_TYPES() \
- x(initial, 0) \
- x(init_next_bset, 1) \
- x(cache_reclaim, 2) \
- x(journal_reclaim, 3) \
- x(interior, 4)
-
-enum btree_write_type {
-#define x(t, n) BTREE_WRITE_##t,
- BCH_BTREE_WRITE_TYPES()
-#undef x
- BTREE_WRITE_TYPE_NR,
-};
-
-#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
-#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
-
-#define BTREE_FLAGS() \
- x(read_in_flight) \
- x(read_error) \
- x(dirty) \
- x(need_write) \
- x(write_blocked) \
- x(will_make_reachable) \
- x(noevict) \
- x(write_idx) \
- x(accessed) \
- x(write_in_flight) \
- x(write_in_flight_inner) \
- x(just_written) \
- x(dying) \
- x(fake) \
- x(need_rewrite) \
- x(need_rewrite_error) \
- x(need_rewrite_degraded) \
- x(need_rewrite_ptr_written_zero) \
- x(never_write) \
- x(pinned)
-
-enum btree_flags {
- /* First bits for btree node write type */
- BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
-#define x(flag) BTREE_NODE_##flag,
- BTREE_FLAGS()
-#undef x
-};
-
-#define x(flag) \
-static inline bool btree_node_ ## flag(struct btree *b) \
-{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
- \
-static inline void set_btree_node_ ## flag(struct btree *b) \
-{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
- \
-static inline void clear_btree_node_ ## flag(struct btree *b) \
-{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-
-BTREE_FLAGS()
-#undef x
-
-#define BTREE_NODE_REWRITE_REASON() \
- x(none) \
- x(unknown) \
- x(error) \
- x(degraded) \
- x(ptr_written_zero)
-
-enum btree_node_rewrite_reason {
-#define x(n) BTREE_NODE_REWRITE_##n,
- BTREE_NODE_REWRITE_REASON()
-#undef x
-};
-
-static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b)
-{
- if (btree_node_need_rewrite_ptr_written_zero(b))
- return BTREE_NODE_REWRITE_ptr_written_zero;
- if (btree_node_need_rewrite_degraded(b))
- return BTREE_NODE_REWRITE_degraded;
- if (btree_node_need_rewrite_error(b))
- return BTREE_NODE_REWRITE_error;
- if (btree_node_need_rewrite(b))
- return BTREE_NODE_REWRITE_unknown;
- return BTREE_NODE_REWRITE_none;
-}
-
-static inline struct btree_write *btree_current_write(struct btree *b)
-{
- return b->writes + btree_node_write_idx(b);
-}
-
-static inline struct btree_write *btree_prev_write(struct btree *b)
-{
- return b->writes + (btree_node_write_idx(b) ^ 1);
-}
-
-static inline struct bset_tree *bset_tree_last(struct btree *b)
-{
- EBUG_ON(!b->nsets);
- return b->set + b->nsets - 1;
-}
-
-static inline void *
-__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
-{
- return (void *) ((u64 *) b->data + offset);
-}
-
-static inline u16
-__btree_node_ptr_to_offset(const struct btree *b, const void *p)
-{
- u16 ret = (u64 *) p - (u64 *) b->data;
-
- EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
- return ret;
-}
-
-static inline struct bset *bset(const struct btree *b,
- const struct bset_tree *t)
-{
- return __btree_node_offset_to_ptr(b, t->data_offset);
-}
-
-static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
-{
- t->end_offset =
- __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
-}
-
-static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
- const struct bset *i)
-{
- t->data_offset = __btree_node_ptr_to_offset(b, i);
- set_btree_bset_end(b, t);
-}
-
-static inline struct bset *btree_bset_first(struct btree *b)
-{
- return bset(b, b->set);
-}
-
-static inline struct bset *btree_bset_last(struct btree *b)
-{
- return bset(b, bset_tree_last(b));
-}
-
-static inline u16
-__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
-{
- return __btree_node_ptr_to_offset(b, k);
-}
-
-static inline struct bkey_packed *
-__btree_node_offset_to_key(const struct btree *b, u16 k)
-{
- return __btree_node_offset_to_ptr(b, k);
-}
-
-static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
-{
- return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
-}
-
-#define btree_bkey_first(_b, _t) \
-({ \
- EBUG_ON(bset(_b, _t)->start != \
- __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
- \
- bset(_b, _t)->start; \
-})
-
-#define btree_bkey_last(_b, _t) \
-({ \
- EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
- vstruct_last(bset(_b, _t))); \
- \
- __btree_node_offset_to_key(_b, (_t)->end_offset); \
-})
-
-static inline unsigned bset_u64s(struct bset_tree *t)
-{
- return t->end_offset - t->data_offset -
- sizeof(struct bset) / sizeof(u64);
-}
-
-static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
-{
- return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
-}
-
-static inline unsigned bset_byte_offset(struct btree *b, void *i)
-{
- return i - (void *) b->data;
-}
-
-enum btree_node_type {
- BKEY_TYPE_btree,
-#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
- BCH_BTREE_IDS()
-#undef x
- BKEY_TYPE_NR
-};
-
-/* Type of a key in btree @id at level @level: */
-static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
-{
- return level ? BKEY_TYPE_btree : (unsigned) id + 1;
-}
-
-/* Type of keys @b contains: */
-static inline enum btree_node_type btree_node_type(struct btree *b)
-{
- return __btree_node_type(b->c.level, b->c.btree_id);
-}
-
-const char *bch2_btree_node_type_str(enum btree_node_type);
-
-#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
- (BIT_ULL(BKEY_TYPE_extents)| \
- BIT_ULL(BKEY_TYPE_alloc)| \
- BIT_ULL(BKEY_TYPE_inodes)| \
- BIT_ULL(BKEY_TYPE_stripes)| \
- BIT_ULL(BKEY_TYPE_reflink)| \
- BIT_ULL(BKEY_TYPE_subvolumes)| \
- BIT_ULL(BKEY_TYPE_btree))
-
-#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
- (BIT_ULL(BKEY_TYPE_alloc)| \
- BIT_ULL(BKEY_TYPE_inodes)| \
- BIT_ULL(BKEY_TYPE_stripes)| \
- BIT_ULL(BKEY_TYPE_snapshots))
-
-#define BTREE_NODE_TYPE_HAS_TRIGGERS \
- (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
- BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
-
-static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type)
-{
- return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS;
-}
-
-static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type)
-{
- return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS;
-}
-
-static inline bool btree_node_type_has_triggers(enum btree_node_type type)
-{
- return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
-}
-
-static inline bool btree_id_is_extents(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
-{
- return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1);
-}
-
-static inline bool btree_type_has_snapshots(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_has_snapshot_field(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_has_ptrs(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline bool btree_type_uses_write_buffer(enum btree_id btree)
-{
- const u64 mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_write_buffer)) << nr)
- BCH_BTREE_IDS()
-#undef x
- ;
-
- return BIT_ULL(btree) & mask;
-}
-
-static inline u8 btree_trigger_order(enum btree_id btree)
-{
- switch (btree) {
- case BTREE_ID_alloc:
- return U8_MAX;
- case BTREE_ID_stripes:
- return U8_MAX - 1;
- default:
- return btree;
- }
-}
-
-struct btree_root {
- struct btree *b;
-
- /* On disk root - see async splits: */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
- u8 level;
- u8 alive;
- s16 error;
-};
-
-enum btree_gc_coalesce_fail_reason {
- BTREE_GC_COALESCE_FAIL_RESERVE_GET,
- BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
- BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
-};
-
-enum btree_node_sibling {
- btree_prev_sib,
- btree_next_sib,
-};
-
-struct get_locks_fail {
- unsigned l;
- struct btree *b;
-};
-
-#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
deleted file mode 100644
index ee657b9f4b96..000000000000
--- a/fs/bcachefs/btree_update.c
+++ /dev/null
@@ -1,916 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_iter.h"
-#include "btree_journal_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "extents.h"
-#include "keylist.h"
-#include "snapshot.h"
-#include "trace.h"
-
-#include <linux/string_helpers.h>
-
-static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
- const struct btree_insert_entry *r)
-{
- return cmp_int(l->sort_order, r->sort_order) ?:
- cmp_int(l->cached, r->cached) ?:
- -cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p);
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
- struct bkey_i *, enum btree_iter_update_trigger_flags,
- unsigned long ip);
-
-static noinline int extent_front_merge(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bkey_i **insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i *update;
- int ret;
-
- if (unlikely(trans->journal_replay_not_finished))
- return 0;
-
- update = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- return ret;
-
- if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
- return 0;
-
- ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?:
- bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- ret = bch2_btree_delete_at(trans, iter, flags);
- if (ret)
- return ret;
-
- *insert = update;
- return 0;
-}
-
-static noinline int extent_back_merge(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- int ret;
-
- if (unlikely(trans->journal_replay_not_finished))
- return 0;
-
- ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
- bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- bch2_bkey_merge(c, bkey_i_to_s(insert), k);
- return 0;
-}
-
-/*
- * When deleting, check if we need to emit a whiteout (because we're overwriting
- * something in an ancestor snapshot)
- */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u32 snapshot = pos.snapshot;
- int ret;
-
- if (!bch2_snapshot_parent(trans->c, pos.snapshot))
- return 0;
-
- pos.snapshot++;
-
- for_each_btree_key_norestart(trans, iter, btree_id, pos,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_nopreserve, k, ret) {
- if (!bkey_eq(k.k->p, pos))
- break;
-
- if (bch2_snapshot_is_ancestor(trans->c, snapshot,
- k.k->p.snapshot)) {
- ret = !bkey_whiteout(k.k);
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos,
- snapshot_id_list *s)
-{
- int ret = 0;
-
- darray_for_each(*s, id) {
- pos.snapshot = *id;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_intent);
- ret = bkey_err(k);
- if (ret)
- break;
-
- if (k.k->type == KEY_TYPE_deleted) {
- struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
- ret = PTR_ERR_OR_ZERO(update);
- if (ret) {
- bch2_trans_iter_exit(trans, &iter);
- break;
- }
-
- bkey_init(&update->k);
- update->k.p = pos;
- update->k.type = KEY_TYPE_whiteout;
-
- ret = bch2_trans_update(trans, &iter, update,
- BTREE_UPDATE_internal_snapshot_node);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- break;
- }
-
- darray_exit(s);
- return ret;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_iter_update_trigger_flags flags,
- struct bkey_s_c old,
- struct bkey_s_c new)
-{
- enum btree_id btree_id = iter->btree_id;
- struct bkey_i *update;
- struct bpos new_start = bkey_start_pos(new.k);
- unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start);
- unsigned back_split = bkey_gt(old.k->p, new.k->p);
- unsigned middle_split = (front_split || back_split) &&
- old.k->p.snapshot != new.k->p.snapshot;
- unsigned nr_splits = front_split + back_split + middle_split;
- int ret = 0, compressed_sectors;
-
- /*
- * If we're going to be splitting a compressed extent, note it
- * so that __bch2_trans_commit() can increase our disk
- * reservation:
- */
- if (nr_splits > 1 &&
- (compressed_sectors = bch2_bkey_sectors_compressed(old)))
- trans->extra_disk_res += compressed_sectors * (nr_splits - 1);
-
- if (front_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_back(new_start, update);
-
- ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
- old.k->p, update->k.p) ?:
- bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_internal_snapshot_node|flags);
- if (ret)
- return ret;
- }
-
- /* If we're overwriting in a different snapshot - middle split: */
- if (middle_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_front(new_start, update);
- bch2_cut_back(new.k->p, update);
-
- ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
- old.k->p, update->k.p) ?:
- bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_internal_snapshot_node|flags);
- if (ret)
- return ret;
- }
-
- if (bkey_le(old.k->p, new.k->p)) {
- update = bch2_trans_kmalloc(trans, sizeof(*update));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bkey_init(&update->k);
- update->k.p = old.k->p;
- update->k.p.snapshot = new.k->p.snapshot;
-
- if (new.k->p.snapshot != old.k->p.snapshot) {
- update->k.type = KEY_TYPE_whiteout;
- } else if (btree_type_has_snapshots(btree_id)) {
- ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
- if (ret < 0)
- return ret;
- if (ret)
- update->k.type = KEY_TYPE_whiteout;
- }
-
- ret = bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_internal_snapshot_node|flags);
- if (ret)
- return ret;
- }
-
- if (back_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_front(new.k->p, update);
-
- ret = bch2_trans_update_by_path(trans, iter->path, update,
- BTREE_UPDATE_internal_snapshot_node|
- flags, _RET_IP_);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int bch2_trans_update_extent(struct btree_trans *trans,
- struct btree_iter *orig_iter,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- enum btree_id btree_id = orig_iter->btree_id;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
- BTREE_ITER_intent|
- BTREE_ITER_with_updates|
- BTREE_ITER_not_extents);
- k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
- if ((ret = bkey_err(k)))
- goto err;
- if (!k.k)
- goto out;
-
- if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
- if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
- ret = extent_front_merge(trans, &iter, k, &insert, flags);
- if (ret)
- goto err;
- }
-
- goto next;
- }
-
- while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
- bool done = bkey_lt(insert->k.p, k.k->p);
-
- ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
- if (ret)
- goto err;
-
- if (done)
- goto out;
-next:
- bch2_btree_iter_advance(trans, &iter);
- k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
- if ((ret = bkey_err(k)))
- goto err;
- if (!k.k)
- goto out;
- }
-
- if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
- ret = extent_back_merge(trans, &iter, insert, k);
- if (ret)
- goto err;
- }
-out:
- if (!bkey_deleted(&insert->k))
- ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static noinline int flush_new_cached_update(struct btree_trans *trans,
- struct btree_insert_entry *i,
- enum btree_iter_update_trigger_flags flags,
- unsigned long ip)
-{
- struct bkey k;
- int ret;
-
- btree_path_idx_t path_idx =
- bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
- BTREE_ITER_intent, _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, path_idx, 0);
- if (ret)
- goto out;
-
- struct btree_path *btree_path = trans->paths + path_idx;
-
- /*
- * The old key in the insert entry might actually refer to an existing
- * key in the btree that has been deleted from cache and not yet
- * flushed. Check for this and skip the flush so we don't run triggers
- * against a stale key.
- */
- bch2_btree_path_peek_slot_exact(btree_path, &k);
- if (!bkey_deleted(&k))
- goto out;
-
- i->key_cache_already_flushed = true;
- i->flags |= BTREE_TRIGGER_norun;
-
- btree_path_set_should_be_locked(trans, btree_path);
- ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
-out:
- bch2_path_put(trans, path_idx, true);
- return ret;
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
- unsigned long ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i, n;
- int cmp;
-
- struct btree_path *path = trans->paths + path_idx;
- EBUG_ON(!path->should_be_locked);
- EBUG_ON(trans->nr_updates >= trans->nr_paths);
- EBUG_ON(!bpos_eq(k->k.p, path->pos));
-
- n = (struct btree_insert_entry) {
- .flags = flags,
- .sort_order = btree_trigger_order(path->btree_id),
- .bkey_type = __btree_node_type(path->level, path->btree_id),
- .btree_id = path->btree_id,
- .level = path->level,
- .cached = path->cached,
- .path = path_idx,
- .k = k,
- .ip_allocated = ip,
- };
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans_for_each_update(trans, i)
- BUG_ON(i != trans->updates &&
- btree_insert_entry_cmp(i - 1, i) >= 0);
-#endif
-
- /*
- * Pending updates are kept sorted: first, find position of new update,
- * then delete/trim any updates the new update overwrites:
- */
- for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) {
- cmp = btree_insert_entry_cmp(&n, i);
- if (cmp <= 0)
- break;
- }
-
- bool overwrite = !cmp && i < trans->updates + trans->nr_updates;
-
- if (overwrite) {
- EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
-
- bch2_path_put(trans, i->path, true);
- i->flags = n.flags;
- i->cached = n.cached;
- i->k = n.k;
- i->path = n.path;
- i->ip_allocated = n.ip_allocated;
- } else {
- array_insert_item(trans->updates, trans->nr_updates,
- i - trans->updates, n);
-
- i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
- i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
-
- if (j_k) {
- i->old_k = j_k->k;
- i->old_v = &j_k->v;
- }
- }
- }
-
- __btree_path_get(trans, trans->paths + i->path, true);
-
- trace_update_by_path(trans, path, i, overwrite);
-
- /*
- * If a key is present in the key cache, it must also exist in the
- * btree - this is necessary for cache coherency. When iterating over
- * a btree that's cached in the key cache, the btree iter code checks
- * the key cache - but the key has to exist in the btree for that to
- * work:
- */
- if (path->cached && !i->old_btree_u64s)
- return flush_new_cached_update(trans, i, flags, ip);
-
- return 0;
-}
-
-static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree_path *path)
-{
- struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter);
-
- if (!key_cache_path ||
- !key_cache_path->should_be_locked ||
- !bpos_eq(key_cache_path->pos, iter->pos)) {
- struct bkey_cached *ck;
- int ret;
-
- if (!iter->key_cache_path)
- iter->key_cache_path =
- bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_intent|
- BTREE_ITER_cached, _THIS_IP_);
-
- iter->key_cache_path =
- bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
-
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_cached);
- if (unlikely(ret))
- return ret;
-
- ck = (void *) trans->paths[iter->key_cache_path].l[0].b;
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
- }
-
- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
- }
-
- return 0;
-}
-
-int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
- unsigned long ip)
-{
- kmsan_check_memory(k, bkey_bytes(&k->k));
-
- btree_path_idx_t path_idx = iter->update_path ?: iter->path;
- int ret;
-
- if (iter->flags & BTREE_ITER_is_extents)
- return bch2_trans_update_extent(trans, iter, k, flags);
-
- if (bkey_deleted(&k->k) &&
- !(flags & BTREE_UPDATE_key_cache_reclaim) &&
- (iter->flags & BTREE_ITER_filter_snapshots)) {
- ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
- if (unlikely(ret < 0))
- return ret;
-
- if (ret)
- k->k.type = KEY_TYPE_whiteout;
- }
-
- /*
- * Ensure that updates to cached btrees go to the key cache:
- */
- struct btree_path *path = trans->paths + path_idx;
- if (!(flags & BTREE_UPDATE_key_cache_reclaim) &&
- !path->cached &&
- !path->level &&
- btree_id_cached(trans->c, path->btree_id)) {
- ret = bch2_trans_update_get_key_cache(trans, iter, path);
- if (ret)
- return ret;
-
- path_idx = iter->key_cache_path;
- }
-
- return bch2_trans_update_by_path(trans, path_idx, k, flags, ip);
-}
-
-int bch2_btree_insert_clone_trans(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
-{
- struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- bkey_copy(n, k);
- return bch2_btree_insert_trans(trans, btree, n, 0);
-}
-
-void *__bch2_trans_subbuf_alloc(struct btree_trans *trans,
- struct btree_trans_subbuf *buf,
- unsigned u64s)
-{
- unsigned new_top = buf->u64s + u64s;
- unsigned new_size = buf->size;
-
- BUG_ON(roundup_pow_of_two(new_top) > U16_MAX);
-
- if (new_top > new_size)
- new_size = roundup_pow_of_two(new_top);
-
- void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64));
- if (IS_ERR(n))
- return n;
-
- unsigned offset = (u64 *) n - (u64 *) trans->mem;
- BUG_ON(offset > U16_MAX);
-
- if (buf->u64s)
- memcpy(n,
- btree_trans_subbuf_base(trans, buf),
- buf->size * sizeof(u64));
- buf->base = (u64 *) n - (u64 *) trans->mem;
- buf->size = new_size;
-
- void *p = btree_trans_subbuf_top(trans, buf);
- buf->u64s = new_top;
- return p;
-}
-
-int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
- enum btree_id btree, struct bpos end)
-{
- bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
- struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- bch2_btree_iter_advance(trans, iter);
- k = bch2_btree_iter_peek_slot(trans, iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- BUG_ON(k.k->type != KEY_TYPE_deleted);
-
- if (bkey_gt(k.k->p, end)) {
- ret = bch_err_throw(trans->c, ENOSPC_btree_slot);
- goto err;
- }
-
- return 0;
-err:
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-void bch2_trans_commit_hook(struct btree_trans *trans,
- struct btree_trans_commit_hook *h)
-{
- h->next = trans->hooks;
- trans->hooks = h;
-}
-
-int bch2_btree_insert_nonextent(struct btree_trans *trans,
- enum btree_id btree, struct bkey_i *k,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, btree, k->k.p,
- BTREE_ITER_cached|
- BTREE_ITER_not_extents|
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
- BTREE_ITER_intent|flags);
- int ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/**
- * bch2_btree_insert - insert a key into a btree
- * @c: pointer to struct bch_fs
- * @id: btree to insert into
- * @k: key to insert
- * @disk_res: must be non-NULL whenever inserting or potentially
- * splitting data extents
- * @flags: transaction commit flags
- * @iter_flags: btree iter update trigger flags
- *
- * Returns: 0 on success, error code on failure
- */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
- struct disk_reservation *disk_res, int flags,
- enum btree_iter_update_trigger_flags iter_flags)
-{
- return bch2_trans_commit_do(c, disk_res, NULL, flags,
- bch2_btree_insert_trans(trans, id, k, iter_flags));
-}
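/*
 * Hypothetical usage sketch (example_insert_set_key is an illustrative
 * placeholder, not a bcachefs function): insert a single zero-size
 * KEY_TYPE_set key, with no disk reservation since no data extents are
 * involved.
 */
static int example_insert_set_key(struct bch_fs *c, enum btree_id btree,
				  struct bpos pos)
{
	struct bkey_i k;

	bkey_init(&k.k);
	k.k.type = KEY_TYPE_set;
	k.k.p = pos;

	return bch2_btree_insert(c, btree, &k, NULL, 0, 0);
}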
-
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned update_flags)
-{
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
- int ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- return ret;
-
- bkey_init(&k->k);
- k->k.p = iter->pos;
- return bch2_trans_update(trans, iter, k, update_flags);
-}
-
-int bch2_btree_delete(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos,
- unsigned update_flags)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, btree, pos,
- BTREE_ITER_cached|
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_delete_at(trans, &iter, update_flags);
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
- struct bpos start, struct bpos end,
- unsigned update_flags,
- u64 *journal_seq)
-{
- u32 restart_count = trans->restart_count;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
- while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(trans->c, 0);
- struct bkey_i delete;
-
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bkey_init(&delete.k);
-
- /*
- * This could probably be more efficient for extents:
- */
-
- /*
- * For extents, iter.pos won't necessarily be the same as
- * bkey_start_pos(k.k) (for non extents they always will be the
- * same). It's important that we delete starting from iter.pos
- * because the range we want to delete could start in the middle
- * of k.
- *
- * (bch2_btree_iter_peek() does guarantee that iter.pos >=
- * bkey_start_pos(k.k)).
- */
- delete.k.p = iter.pos;
-
- if (iter.flags & BTREE_ITER_is_extents)
- bch2_key_resize(&delete.k,
- bpos_min(end, k.k->p).offset -
- iter.pos.offset);
-
- ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
- bch2_trans_commit(trans, &disk_res, journal_seq,
- BCH_TRANS_COMMIT_no_enospc);
- bch2_disk_reservation_put(trans->c, &disk_res);
-err:
- /*
- * the bch2_trans_begin() call is in a weird place because we
- * need to call it after every transaction commit, to avoid path
- * overflow, but don't want to call it if the delete operation
- * is a no-op and we have no work to do:
- */
- bch2_trans_begin(trans);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-/*
- * bch2_btree_delete_range - delete everything within a given range
- *
- * Range is a half open interval - [start, end)
- */
-int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
- struct bpos start, struct bpos end,
- unsigned update_flags,
- u64 *journal_seq)
-{
- int ret = bch2_trans_run(c,
- bch2_btree_delete_range_trans(trans, id, start, end,
- update_flags, journal_seq));
- if (ret == -BCH_ERR_transaction_restart_nested)
- ret = 0;
- return ret;
-}
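/*
 * Hypothetical usage sketch (example_wipe_range is an illustrative
 * placeholder): delete every key in [start, end) from a btree, with no
 * journal sequence tracking.
 */
static int example_wipe_range(struct bch_fs *c, enum btree_id btree,
			      struct bpos start, struct bpos end)
{
	return bch2_btree_delete_range(c, btree, start, end, 0, NULL);
}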
-
-int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set)
-{
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
- int ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- return ret;
-
- bkey_init(&k->k);
- k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k->k.p = iter->pos;
- if (iter->flags & BTREE_ITER_is_extents)
- bch2_key_resize(&k->k, 1);
-
- return bch2_trans_update(trans, iter, k, 0);
-}
-
-int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
- struct bpos pos, bool set)
-{
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
-
- int ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_bit_mod_iter(trans, &iter, set);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
- struct bpos pos, bool set)
-{
- struct bkey_i k;
-
- bkey_init(&k.k);
- k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k.k.p = pos;
-
- return bch2_trans_update_buffered(trans, btree, &k);
-}
-
-static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len)
-{
- unsigned u64s = DIV_ROUND_UP(len, sizeof(u64));
-
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
- int ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
- journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
- memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0);
- return 0;
-}
-
-int bch2_trans_log_str(struct btree_trans *trans, const char *str)
-{
- return __bch2_trans_log_str(trans, str, strlen(str));
-}
-
-int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
-{
- int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- return ret;
-
- return __bch2_trans_log_str(trans, buf->buf, buf->pos);
-}
-
-int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree,
- unsigned level, struct bkey_i *k)
-{
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
- int ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s);
- bkey_copy(e->start, k);
- return 0;
-}
-
-__printf(3, 0)
-static int
-__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
- va_list args)
-{
- struct printbuf buf = PRINTBUF;
- prt_vprintf(&buf, fmt, args);
-
- unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-
- int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- goto err;
-
- if (!test_bit(JOURNAL_running, &c->journal.flags)) {
- ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
- if (ret)
- goto err;
-
- struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
- journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
- memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0);
- c->journal.early_journal_entries.nr += jset_u64s(u64s);
- } else {
- ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
- bch2_trans_log_msg(trans, &buf));
- }
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-__printf(2, 3)
-int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = __bch2_fs_log_msg(c, 0, fmt, args);
- va_end(args);
- return ret;
-}
-
-/*
- * Use for logging messages during recovery: commits with the reclaim
- * watermark so reserved space can be used and we don't block on journal
- * reclaim.
- */
-__printf(2, 3)
-int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
- va_end(args);
- return ret;
-}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
deleted file mode 100644
index 0b98ab959719..000000000000
--- a/fs/bcachefs/btree_update.h
+++ /dev/null
@@ -1,429 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_H
-#define _BCACHEFS_BTREE_UPDATE_H
-
-#include "btree_iter.h"
-#include "journal.h"
-#include "snapshot.h"
-
-struct bch_fs;
-struct btree;
-
-void bch2_btree_node_prep_for_write(struct btree_trans *,
- struct btree_path *, struct btree *);
-bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
- struct btree *, struct btree_node_iter *,
- struct bkey_i *);
-
-int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
-int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
-void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
-
-void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
- struct bkey_i *, u64);
-
-#define BCH_TRANS_COMMIT_FLAGS() \
- x(no_enospc, "don't check for enospc") \
- x(no_check_rw, "don't attempt to take a ref on c->writes") \
- x(no_journal_res, "don't take a journal reservation, instead " \
- "pin journal entry referred to by trans->journal_res.seq") \
- x(journal_reclaim, "operation required for journal reclaim; may return error " \
- "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
- x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied")
-
-enum __bch_trans_commit_flags {
- /* First bits for bch_watermark: */
- __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
-#define x(n, ...) __BCH_TRANS_COMMIT_##n,
- BCH_TRANS_COMMIT_FLAGS()
-#undef x
-};
-
-enum bch_trans_commit_flags {
-#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
- BCH_TRANS_COMMIT_FLAGS()
-#undef x
-};
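/*
 * Expansion sketch: for the no_enospc entry, the x-macros above generate
 * roughly
 *
 *	__BCH_TRANS_COMMIT_no_enospc,
 *	BCH_TRANS_COMMIT_no_enospc = BIT(__BCH_TRANS_COMMIT_no_enospc),
 *
 * so the commit flags occupy the bits above the BCH_WATERMARK_BITS-wide
 * watermark field and can be OR'd together with a watermark in one flags
 * word.
 */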
-
-void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);
-
-int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
-int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
-
-int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
- struct bkey_i *, enum btree_iter_update_trigger_flags);
-
-int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
- enum btree_iter_update_trigger_flags);
-int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
- struct disk_reservation *, int flags,
- enum btree_iter_update_trigger_flags iter_flags);
-
-int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
- struct bpos, struct bpos, unsigned, u64 *);
-int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
- struct bpos, struct bpos, unsigned, u64 *);
-
-int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool);
-int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
-int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
-
-static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos)
-{
- return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
- struct bpos, snapshot_id_list *);
-
-/*
- * For use when splitting extents in existing snapshots:
- *
- * If @old_pos is an interior snapshot node, iterate over descendant snapshot
- * nodes: for every descendant snapshot in which @old_pos is overwritten and
- * not visible, emit a whiteout at @new_pos.
- */
-static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id btree,
- struct bpos old_pos,
- struct bpos new_pos)
-{
- BUG_ON(old_pos.snapshot != new_pos.snapshot);
-
- if (!btree_type_has_snapshots(btree) ||
- bkey_eq(old_pos, new_pos))
- return 0;
-
- snapshot_id_list s;
- int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s);
- if (ret)
- return ret;
-
- return s.nr
- ? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s)
- : 0;
-}
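/*
 * Hypothetical usage sketch (example_update_split_extent is an
 * illustrative placeholder): when part of an extent at @old_pos is
 * re-inserted at a new position in the same snapshot, emit the required
 * whiteouts and then queue the update (bch2_trans_update() is declared
 * further down in this header):
 */
static inline int example_update_split_extent(struct btree_trans *trans,
					      struct btree_iter *iter,
					      struct bpos old_pos,
					      struct bkey_i *new_k)
{
	return bch2_insert_snapshot_whiteouts(trans, iter->btree_id,
					      old_pos, new_k->k.p) ?:
		bch2_trans_update(trans, iter, new_k, 0);
}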
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
- enum btree_iter_update_trigger_flags,
- struct bkey_s_c, struct bkey_s_c);
-
-int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
- enum btree_id, struct bpos);
-
-int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, enum btree_iter_update_trigger_flags,
- unsigned long);
-
-static inline int __must_check
-bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
-{
- return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_);
-}
-
-static inline void *btree_trans_subbuf_base(struct btree_trans *trans,
- struct btree_trans_subbuf *buf)
-{
- return (u64 *) trans->mem + buf->base;
-}
-
-static inline void *btree_trans_subbuf_top(struct btree_trans *trans,
- struct btree_trans_subbuf *buf)
-{
- return (u64 *) trans->mem + buf->base + buf->u64s;
-}
-
-void *__bch2_trans_subbuf_alloc(struct btree_trans *,
- struct btree_trans_subbuf *,
- unsigned);
-
-static inline void *
-bch2_trans_subbuf_alloc(struct btree_trans *trans,
- struct btree_trans_subbuf *buf,
- unsigned u64s)
-{
- if (buf->u64s + u64s > buf->size)
- return __bch2_trans_subbuf_alloc(trans, buf, u64s);
-
- void *p = btree_trans_subbuf_top(trans, buf);
- buf->u64s += u64s;
- return p;
-}
-
-static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans)
-{
- return btree_trans_subbuf_base(trans, &trans->journal_entries);
-}
-
-static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
-{
- return btree_trans_subbuf_top(trans, &trans->journal_entries);
-}
-
-static inline struct jset_entry *
-bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
-{
- return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s);
-}
-
-int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
-
-int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *);
-
-static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
-{
- kmsan_check_memory(k, bkey_bytes(&k->k));
-
- EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
-
- if (unlikely(!btree_type_uses_write_buffer(btree))) {
- int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k);
- dump_stack();
- return ret;
- }
- /*
- * Most updates skip the btree write buffer until journal replay is
- * finished because synchronization with journal replay relies on having
- * a btree node locked - if we're overwriting a key in the journal that
- * journal replay hasn't yet replayed, we have to mark it as
- * overwritten.
- *
- * But accounting updates don't overwrite, they're deltas, and they have
- * to be flushed to the btree strictly in order for journal replay to be
- * able to tell which updates need to be applied:
- */
- if (k->k.type != KEY_TYPE_accounting &&
- unlikely(trans->journal_replay_not_finished))
- return bch2_btree_insert_clone_trans(trans, btree, k);
-
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
- int ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s);
- bkey_copy(e->start, k);
- return 0;
-}
-
-void bch2_trans_commit_hook(struct btree_trans *,
- struct btree_trans_commit_hook *);
-int __bch2_trans_commit(struct btree_trans *, unsigned);
-
-int bch2_trans_log_str(struct btree_trans *, const char *);
-int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
-int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *);
-
-__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
-__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
-
-/**
- * bch2_trans_commit - insert keys at given iterator positions
- *
- * This is the main entry point for btree updates.
- *
- * Return values:
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
- */
-static inline int bch2_trans_commit(struct btree_trans *trans,
- struct disk_reservation *disk_res,
- u64 *journal_seq,
- unsigned flags)
-{
- trans->disk_res = disk_res;
- trans->journal_seq = journal_seq;
-
- return __bch2_trans_commit(trans, flags);
-}
-
-#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
- lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_flags)))
-
-#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
- nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_flags)))
-
-#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \
- bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
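/*
 * Hypothetical usage sketch (example_insert_and_commit is an illustrative
 * placeholder): the typical pattern is to build updates inside a
 * transaction and let commit_do()/bch2_trans_commit_do() retry on
 * transaction restart:
 */
static inline int example_insert_and_commit(struct bch_fs *c, enum btree_id btree,
					    struct bkey_i *k)
{
	return bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_btree_insert_trans(trans, btree, k, 0));
}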
-
-#define trans_for_each_update(_trans, _i) \
- for (struct btree_insert_entry *_i = (_trans)->updates; \
- (_i) < (_trans)->updates + (_trans)->nr_updates; \
- (_i)++)
-
-static inline void bch2_trans_reset_updates(struct btree_trans *trans)
-{
- trans_for_each_update(trans, i)
- bch2_path_put(trans, i->path, true);
-
- trans->nr_updates = 0;
- trans->journal_entries.u64s = 0;
- trans->journal_entries.size = 0;
- trans->accounting.u64s = 0;
- trans->accounting.size = 0;
- trans->hooks = NULL;
- trans->extra_disk_res = 0;
-}
-
-static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
- unsigned type, unsigned min_bytes)
-{
- unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
- struct bkey_i *mut;
-
- if (type && k.k->type != type)
- return ERR_PTR(-ENOENT);
-
- /* extra padding for varint_decode_fast... */
- mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8);
- if (!IS_ERR(mut)) {
- bkey_reassemble(mut, k);
-
- if (unlikely(bytes > bkey_bytes(k.k))) {
- memset((void *) mut + bkey_bytes(k.k), 0,
- bytes - bkey_bytes(k.k));
- mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64));
- }
- }
- return mut;
-}
-
-static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
-{
- return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
-}
-
-#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \
- bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \
- KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned min_bytes)
-{
- struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
- int ret;
-
- if (IS_ERR(mut))
- return mut;
-
- ret = bch2_trans_update(trans, iter, mut, flags);
- if (ret)
- return ERR_PTR(ret);
-
- *k = bkey_i_to_s_c(mut);
- return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans,
- struct btree_iter *iter, struct bkey_s_c *k,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
-}
-
-#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \
- bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\
- KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned min_bytes)
-{
- struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
- btree_id, pos, flags|BTREE_ITER_intent, type);
- struct bkey_i *ret = IS_ERR(k.k)
- ? ERR_CAST(k.k)
- : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
- if (IS_ERR(ret))
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
-}
-
-static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned min_bytes)
-{
- struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
- btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
- int ret;
-
- if (IS_ERR(mut))
- return mut;
-
- ret = bch2_trans_update(trans, iter, mut, flags);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ERR_PTR(ret);
- }
-
- return mut;
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags,
- unsigned min_bytes)
-{
- return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
-}
-
-static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
-}
-
-#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
- bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \
- _btree_id, _pos, _flags, \
- KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
-
-static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
- enum btree_iter_update_trigger_flags flags,
- unsigned type, unsigned val_size)
-{
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
- int ret;
-
- if (IS_ERR(k))
- return k;
-
- bkey_init(&k->k);
- k->k.p = iter->pos;
- k->k.type = type;
- set_bkey_val_bytes(&k->k, val_size);
-
- ret = bch2_trans_update(trans, iter, k, flags);
- if (unlikely(ret))
- return ERR_PTR(ret);
- return k;
-}
-
-#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \
- bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \
- KEY_TYPE_##_type, sizeof(struct bch_##_type)))
-
-#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
deleted file mode 100644
index 553059b33bfd..000000000000
--- a/fs/bcachefs/btree_update_interior.c
+++ /dev/null
@@ -1,2854 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_journal_iter.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "clock.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/random.h>
-
-static const char * const bch2_btree_update_modes[] = {
-#define x(t) #t,
- BTREE_UPDATE_MODES()
-#undef x
- NULL
-};
-
-static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *);
-
-static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- btree_path_idx_t, struct btree *, struct keylist *);
-static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-
-/*
- * Verify that child nodes correctly span parent node's range:
- */
-int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
- struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
- ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
- : b->data->min_key;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- struct bkey_buf prev;
- int ret = 0;
-
- BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
- b->data->min_key));
-
- bch2_bkey_buf_init(&prev);
- bkey_init(&prev.k->k);
- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
-
- if (b == btree_node_root(c, b)) {
- if (!bpos_eq(b->data->min_key, POS_MIN)) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "btree root with incorrect min_key: ");
- bch2_bpos_to_text(&buf, b->data->min_key);
- prt_newline(&buf);
-
- bch2_count_fsck_err(c, btree_root_bad_min_key, &buf);
- goto err;
- }
-
- if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "btree root with incorrect max_key: ");
- bch2_bpos_to_text(&buf, b->data->max_key);
- prt_newline(&buf);
-
- bch2_count_fsck_err(c, btree_root_bad_max_key, &buf);
- goto err;
- }
- }
-
- if (!b->c.level)
- goto out;
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- if (k.k->type != KEY_TYPE_btree_ptr_v2)
- goto out;
-
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
- struct bpos expected_min = bkey_deleted(&prev.k->k)
- ? node_min
- : bpos_successor(prev.k->k.p);
-
- if (!bpos_eq(expected_min, bp.v->min_key)) {
- prt_str(&buf, "end of prev node doesn't match start of next node");
- prt_str(&buf, "\nprev ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
- prt_str(&buf, "\nnext ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf);
- goto err;
- }
-
- bch2_bkey_buf_reassemble(&prev, c, k);
- bch2_btree_and_journal_iter_advance(&iter);
- }
-
- if (bkey_deleted(&prev.k->k)) {
- prt_printf(&buf, "empty interior node\n");
- bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf);
- goto err;
- }
-
- if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
- prt_str(&buf, "last child node doesn't end at end of parent node\nchild: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
- prt_newline(&buf);
-
- bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf);
- goto err;
- }
-out:
- bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&prev, c);
- printbuf_exit(&buf);
- return ret;
-err:
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_newline(&buf);
-
- ret = __bch2_topology_error(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
- BUG_ON(!ret);
- goto out;
-}
-
-/* Calculate ideal packed bkey format for new btree nodes: */
-
-static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
-{
- struct bkey_packed *k;
- struct bkey uk;
-
- for_each_bset(b, t)
- bset_tree_for_each_key(b, t, k)
- if (!bkey_deleted(k)) {
- uk = bkey_unpack_key(b, k);
- bch2_bkey_format_add_key(s, &uk);
- }
-}
-
-static struct bkey_format bch2_btree_calc_format(struct btree *b)
-{
- struct bkey_format_state s;
-
- bch2_bkey_format_init(&s);
- bch2_bkey_format_add_pos(&s, b->data->min_key);
- bch2_bkey_format_add_pos(&s, b->data->max_key);
- __bch2_btree_calc_format(&s, b);
-
- return bch2_bkey_format_done(&s);
-}
-
-static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
- struct bkey_format *old_f,
- struct bkey_format *new_f)
-{
- /* stupid integer promotion rules */
- ssize_t delta =
- (((int) new_f->key_u64s - old_f->key_u64s) *
- (int) nr.packed_keys) +
- (((int) new_f->key_u64s - BKEY_U64s) *
- (int) nr.unpacked_keys);
-
- BUG_ON(delta + nr.live_u64s < 0);
-
- return nr.live_u64s + delta;
-}
-
-/**
- * bch2_btree_node_format_fits - check if we could rewrite node with a new format
- *
- * @c: filesystem handle
- * @b: btree node to rewrite
- * @nr: number of keys for new node (i.e. b->nr)
- * @new_f: bkey format to translate keys to
- *
- * Returns: true if all re-packed keys will be able to fit in a new node.
- *
- * Assumes all keys will successfully pack with the new format.
- */
-static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
- struct btree_nr_keys nr,
- struct bkey_format *new_f)
-{
- size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
-
- return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
-}
-
-/* Btree node freeing/allocation: */
-
-static void __btree_node_free(struct btree_trans *trans, struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- trace_and_count(c, btree_node_free, trans, b);
-
- BUG_ON(btree_node_write_blocked(b));
- BUG_ON(btree_node_dirty(b));
- BUG_ON(btree_node_need_write(b));
- BUG_ON(b == btree_node_root(c, b));
- BUG_ON(b->ob.nr);
- BUG_ON(!list_empty(&b->write_blocked));
- BUG_ON(b->will_make_reachable);
-
- clear_btree_node_noevict(b);
-}
-
-static void bch2_btree_node_free_inmem(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- bch2_btree_node_lock_write_nofail(trans, path, &b->c);
-
- __btree_node_free(trans, b);
-
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- six_unlock_write(&b->c.lock);
- mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
-
- bch2_trans_node_drop(trans, b);
-}
-
-static void bch2_btree_node_free_never_used(struct btree_update *as,
- struct btree_trans *trans,
- struct btree *b)
-{
- struct bch_fs *c = as->c;
- struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
-
- BUG_ON(!list_empty(&b->write_blocked));
- BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
-
- b->will_make_reachable = 0;
- closure_put(&as->cl);
-
- clear_btree_node_will_make_reachable(b);
- clear_btree_node_accessed(b);
- clear_btree_node_dirty_acct(c, b);
- clear_btree_node_need_write(b);
-
- mutex_lock(&c->btree_cache.lock);
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
-
- BUG_ON(p->nr >= ARRAY_SIZE(p->b));
- p->b[p->nr++] = b;
-
- six_unlock_intent(&b->c.lock);
-
- bch2_trans_node_drop(trans, b);
-}
-
-static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
- struct disk_reservation *res,
- struct closure *cl,
- bool interior_node,
- unsigned target,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct write_point *wp;
- struct btree *b;
- BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- struct open_buckets obs = { .nr = 0 };
- struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
- ? BTREE_NODE_RESERVE
- : 0;
- int ret;
-
- b = bch2_btree_node_mem_alloc(trans, interior_node);
- if (IS_ERR(b))
- return b;
-
- BUG_ON(b->ob.nr);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- if (c->btree_reserve_cache_nr > nr_reserve) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- obs = a->ob;
- bkey_copy(&tmp.k, &a->k);
- mutex_unlock(&c->btree_reserve_cache_lock);
- goto out;
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-retry:
- ret = bch2_alloc_sectors_start_trans(trans,
- target ?:
- c->opts.metadata_target ?:
- c->opts.foreground_target,
- 0,
- writepoint_ptr(&c->btree_write_point),
- &devs_have,
- res->nr_replicas,
- min(res->nr_replicas,
- c->opts.metadata_replicas_required),
- watermark,
- target ? BCH_WRITE_only_specified_devs : 0,
- cl, &wp);
- if (unlikely(ret))
- goto err;
-
- if (wp->sectors_free < btree_sectors(c)) {
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- if (ob->sectors_free < btree_sectors(c))
- ob->sectors_free = 0;
-
- bch2_alloc_sectors_done(c, wp);
- goto retry;
- }
-
- bkey_btree_ptr_v2_init(&tmp.k);
- bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
-
- bch2_open_bucket_get(c, wp, &obs);
- bch2_alloc_sectors_done(c, wp);
-out:
- bkey_copy(&b->key, &tmp.k);
- b->ob = obs;
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- return b;
-err:
- bch2_btree_node_to_freelist(c, b);
- return ERR_PTR(ret);
-}
-
-static struct btree *bch2_btree_node_alloc(struct btree_update *as,
- struct btree_trans *trans,
- unsigned level)
-{
- struct bch_fs *c = as->c;
- struct btree *b;
- struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
- int ret;
-
- BUG_ON(level >= BTREE_MAX_DEPTH);
- BUG_ON(!p->nr);
-
- b = p->b[--p->nr];
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
-
- set_btree_node_accessed(b);
- set_btree_node_dirty_acct(c, b);
- set_btree_node_need_write(b);
-
- bch2_bset_init_first(b, &b->data->keys);
- b->c.level = level;
- b->c.btree_id = as->btree_id;
- b->version_ondisk = c->sb.version;
-
- memset(&b->nr, 0, sizeof(b->nr));
- b->data->magic = cpu_to_le64(bset_magic(c));
- memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
- b->data->flags = 0;
- SET_BTREE_NODE_ID(b->data, as->btree_id);
- SET_BTREE_NODE_LEVEL(b->data, level);
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
-
- bp->v.mem_ptr = 0;
- bp->v.seq = b->data->keys.seq;
- bp->v.sectors_written = 0;
- }
-
- SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
-
- bch2_btree_build_aux_trees(b);
-
- ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
- BUG_ON(ret);
-
- trace_and_count(c, btree_node_alloc, trans, b);
- bch2_increment_clock(c, btree_sectors(c), WRITE);
- return b;
-}
-
-static void btree_set_min(struct btree *b, struct bpos pos)
-{
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
- b->data->min_key = pos;
-}
-
-static void btree_set_max(struct btree *b, struct bpos pos)
-{
- b->key.k.p = pos;
- b->data->max_key = pos;
-}
-
-static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
- struct btree_trans *trans,
- struct btree *b)
-{
- struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
- struct bkey_format format = bch2_btree_calc_format(b);
-
- /*
- * The keys might expand with the new format - if they wouldn't fit in
- * the btree node anymore, use the old format for now:
- */
- if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
- format = b->format;
-
- SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
-
- btree_set_min(n, b->data->min_key);
- btree_set_max(n, b->data->max_key);
-
- n->data->format = format;
- btree_node_set_format(n, format);
-
- bch2_btree_sort_into(as->c, n, b);
-
- btree_node_reset_sib_u64s(n);
- return n;
-}
-
-static struct btree *__btree_root_alloc(struct btree_update *as,
- struct btree_trans *trans, unsigned level)
-{
- struct btree *b = bch2_btree_node_alloc(as, trans, level);
-
- btree_set_min(b, POS_MIN);
- btree_set_max(b, SPOS_MAX);
- b->data->format = bch2_btree_calc_format(b);
-
- btree_node_set_format(b, b->data->format);
- bch2_btree_build_aux_trees(b);
-
- return b;
-}
-
-static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
-{
- struct bch_fs *c = as->c;
- struct prealloc_nodes *p;
-
- for (p = as->prealloc_nodes;
- p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
- p++) {
- while (p->nr) {
- struct btree *b = p->b[--p->nr];
-
- mutex_lock(&c->btree_reserve_cache_lock);
-
- if (c->btree_reserve_cache_nr <
- ARRAY_SIZE(c->btree_reserve_cache)) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
-
- a->ob = b->ob;
- b->ob.nr = 0;
- bkey_copy(&a->k, &b->key);
- } else {
- bch2_open_buckets_put(c, &b->ob);
- }
-
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
- __btree_node_free(trans, b);
- bch2_btree_node_to_freelist(c, b);
- }
- }
-}
-
-static int bch2_btree_reserve_get(struct btree_trans *trans,
- struct btree_update *as,
- unsigned nr_nodes[2],
- unsigned target,
- unsigned flags,
- struct closure *cl)
-{
- struct btree *b;
- unsigned interior;
- int ret = 0;
-
- BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
-
- /*
- * Protects reaping from the btree node cache and using the btree node
- * open bucket reserve:
- */
- ret = bch2_btree_cache_cannibalize_lock(trans, cl);
- if (ret)
- return ret;
-
- for (interior = 0; interior < 2; interior++) {
- struct prealloc_nodes *p = as->prealloc_nodes + interior;
-
- while (p->nr < nr_nodes[interior]) {
- b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
- interior, target, flags);
- if (IS_ERR(b)) {
- ret = PTR_ERR(b);
- goto err;
- }
-
- p->b[p->nr++] = b;
- }
- }
-err:
- bch2_btree_cache_cannibalize_unlock(trans);
- return ret;
-}
-
-/* Asynchronous interior node update machinery */
-
-static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
-{
- struct bch_fs *c = as->c;
-
- if (as->took_gc_lock)
- up_read(&c->gc_lock);
- as->took_gc_lock = false;
-
- bch2_journal_pin_drop(&c->journal, &as->journal);
- bch2_journal_pin_flush(&c->journal, &as->journal);
- bch2_disk_reservation_put(c, &as->disk_res);
- bch2_btree_reserve_put(as, trans);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
- as->start_time);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_del(&as->unwritten_list);
- list_del(&as->list);
-
- closure_debug_destroy(&as->cl);
- mempool_free(as, &c->btree_interior_update_pool);
-
- /*
- * Have to do the wakeup with btree_interior_update_lock still held,
- * since being on btree_interior_update_list is our ref on @c:
- */
- closure_wake_up(&c->btree_interior_update_wait);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static void btree_update_add_key(struct btree_update *as,
- struct keylist *keys, struct btree *b)
-{
- struct bkey_i *k = &b->key;
-
- BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
- ARRAY_SIZE(as->_old_keys));
-
- bkey_copy(keys->top, k);
- bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
-
- bch2_keylist_push(keys);
-}
-
-static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
-{
- for_each_keylist_key(&as->new_keys, k)
- if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
- return false;
- return true;
-}
-
-static void btree_update_new_nodes_mark_sb(struct btree_update *as)
-{
- struct bch_fs *c = as->c;
-
- mutex_lock(&c->sb_lock);
- for_each_keylist_key(&as->new_keys, k)
- bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-}
-
-/*
- * The transactional part of an interior btree node update, where we journal the
- * update we did to the interior node and update alloc info:
- */
-static int btree_update_nodes_written_trans(struct btree_trans *trans,
- struct btree_update *as)
-{
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
- int ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
-
- trans->journal_pin = &as->journal;
-
- for_each_keylist_key(&as->old_keys, k) {
- unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
-
- ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
- BTREE_TRIGGER_transactional);
- if (ret)
- return ret;
- }
-
- for_each_keylist_key(&as->new_keys, k) {
- unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
-
- ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
- BTREE_TRIGGER_transactional);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* If the node has been reused, we might be reading uninitialized memory - that's fine: */
-static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq)
-{
- struct btree_node *b_data = READ_ONCE(b->data);
-
- return (b_data ? b_data->keys.seq : 0) == seq;
-}
-
-static void btree_update_nodes_written(struct btree_update *as)
-{
- struct bch_fs *c = as->c;
- struct btree *b;
- struct btree_trans *trans = bch2_trans_get(c);
- u64 journal_seq = 0;
- unsigned i;
- int ret;
-
- /*
- * If we're already in an error state, it might be because a btree node
- * was never written, and we might be trying to free that same btree
- * node here, but it won't have been marked as allocated and we'll see
- * spurious disk usage inconsistencies in the transactional part below
- * if we don't skip it:
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- goto err;
-
- if (!btree_update_new_nodes_marked_sb(as))
- btree_update_new_nodes_mark_sb(as);
-
- /*
- * Wait for any in flight writes to finish before we free the old nodes
- * on disk. But we haven't pinned those old nodes in the btree cache,
- * they might have already been evicted.
- *
- * The update we're completing deleted references to those nodes from the
- * btree, so we know if they've been evicted they can't be pulled back in.
- * We just have to check if the nodes we have pointers to are still those
- * old nodes, and haven't been reused.
- *
- * This can't be done locklessly because the data buffer might have been
- * vmalloc allocated, and they're not RCU freed. We also need the
- * __no_kmsan_checks annotation because even with the btree node read
- * lock, nothing tells us that the data buffer has been initialized (if
- * the btree node has been reused for a different node, and the data
- * buffer swapped for a new data buffer).
- */
- for (i = 0; i < as->nr_old_nodes; i++) {
- b = as->old_nodes[i];
-
- bch2_trans_begin(trans);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]);
- six_unlock_read(&b->c.lock);
- bch2_trans_unlock_long(trans);
-
- if (seq_matches)
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
- TASK_UNINTERRUPTIBLE);
- }
-
- /*
- * We did an update to a parent node where the pointers we added pointed
- * to child nodes that weren't written yet: now, the child nodes have
- * been written so we can write out the update to the interior node.
- */
-
- /*
- * We can't call into journal reclaim here: we'd block on the journal
- * reclaim lock, but we may need to release the open buckets we have
- * pinned in order for other btree updates to make forward progress, and
- * journal reclaim does btree updates when flushing bkey_cached entries,
- * which may require allocations as well.
- */
- ret = commit_do(trans, &as->disk_res, &journal_seq,
- BCH_WATERMARK_interior_updates|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_journal_reclaim,
- btree_update_nodes_written_trans(trans, as));
- bch2_trans_unlock(trans);
-
- bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
- "%s", bch2_err_str(ret));
-err:
- /*
- * Ensure transaction is unlocked before using btree_node_lock_nopath()
- * (the use of which is always suspect, we need to work on removing this
- * in the future)
- *
- * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
- * calls bch2_path_upgrade(), before we call path_make_mut(), so we may
- * rarely end up with a locked path besides the one we have here:
- */
- bch2_trans_unlock(trans);
- bch2_trans_begin(trans);
-
- /*
- * We have to be careful because another thread might be getting ready
- * to free as->b and calling btree_update_reparent() on us - we'll
- * recheck under btree_update_lock below:
- */
- b = READ_ONCE(as->b);
- if (b) {
- /*
- * @b is the node we did the final insert into:
- *
- * On failure to get a journal reservation, we still have to
- * unblock the write and allow most of the write path to happen
- * so that shutdown works, but the i->journal_seq mechanism
- * won't work to prevent the btree write from being visible (we
- * didn't get a journal sequence number) - instead
- * __bch2_btree_node_write() doesn't do the actual write if
- * we're in journal error state:
- */
-
- btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
- as->btree_id, b->c.level, b->key.k.p);
- struct btree_path *path = trans->paths + path_idx;
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
- path->l[b->c.level].b = b;
-
- bch2_btree_node_lock_write_nofail(trans, path, &b->c);
-
- mutex_lock(&c->btree_interior_update_lock);
-
- list_del(&as->write_blocked_list);
- if (list_empty(&b->write_blocked))
- clear_btree_node_write_blocked(b);
-
- /*
- * Node might have been freed, recheck under
- * btree_interior_update_lock:
- */
- if (as->b == b) {
- BUG_ON(!b->c.level);
- BUG_ON(!btree_node_dirty(b));
-
- if (!ret) {
- struct bset *last = btree_bset_last(b);
-
- last->journal_seq = cpu_to_le64(
- max(journal_seq,
- le64_to_cpu(last->journal_seq)));
-
- bch2_btree_add_journal_pin(c, b, journal_seq);
- } else {
- /*
- * If we didn't get a journal sequence number we
- * can't write this btree node, because recovery
- * won't know to ignore this write:
- */
- set_btree_node_never_write(b);
- }
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- six_unlock_write(&b->c.lock);
-
- btree_node_write_if_need(trans, b, SIX_LOCK_intent);
- btree_node_unlock(trans, path, b->c.level);
- bch2_path_put(trans, path_idx, true);
- }
-
- bch2_journal_pin_drop(&c->journal, &as->journal);
-
- mutex_lock(&c->btree_interior_update_lock);
- for (i = 0; i < as->nr_new_nodes; i++) {
- b = as->new_nodes[i];
-
- BUG_ON(b->will_make_reachable != (unsigned long) as);
- b->will_make_reachable = 0;
- clear_btree_node_will_make_reachable(b);
- }
- mutex_unlock(&c->btree_interior_update_lock);
-
- for (i = 0; i < as->nr_new_nodes; i++) {
- b = as->new_nodes[i];
-
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- btree_node_write_if_need(trans, b, SIX_LOCK_read);
- six_unlock_read(&b->c.lock);
- }
-
- for (i = 0; i < as->nr_open_buckets; i++)
- bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
-
- bch2_btree_update_free(as, trans);
- bch2_trans_put(trans);
-}
-
-static void btree_interior_update_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, btree_interior_update_work);
- struct btree_update *as;
-
- while (1) {
- mutex_lock(&c->btree_interior_update_lock);
- as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
- struct btree_update, unwritten_list);
- if (as && !as->nodes_written)
- as = NULL;
- mutex_unlock(&c->btree_interior_update_lock);
-
- if (!as)
- break;
-
- btree_update_nodes_written(as);
- }
-}
-
-static CLOSURE_CALLBACK(btree_update_set_nodes_written)
-{
- closure_type(as, struct btree_update, cl);
- struct bch_fs *c = as->c;
-
- mutex_lock(&c->btree_interior_update_lock);
- as->nodes_written = true;
- mutex_unlock(&c->btree_interior_update_lock);
-
- queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
-}
-
-/*
- * We're updating @b with pointers to nodes that haven't finished writing yet:
- * block @b from being written until @as completes
- */
-static void btree_update_updated_node(struct btree_update *as, struct btree *b)
-{
- struct bch_fs *c = as->c;
-
- BUG_ON(as->mode != BTREE_UPDATE_none);
- BUG_ON(as->update_level_end < b->c.level);
- BUG_ON(!btree_node_dirty(b));
- BUG_ON(!b->c.level);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
- as->mode = BTREE_UPDATE_node;
- as->b = b;
- as->update_level_end = b->c.level;
-
- set_btree_node_write_blocked(b);
- list_add(&as->write_blocked_list, &b->write_blocked);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static int bch2_update_reparent_journal_pin_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- return 0;
-}
-
-static void btree_update_reparent(struct btree_update *as,
- struct btree_update *child)
-{
- struct bch_fs *c = as->c;
-
- lockdep_assert_held(&c->btree_interior_update_lock);
-
- child->b = NULL;
- child->mode = BTREE_UPDATE_update;
-
- bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
- bch2_update_reparent_journal_pin_flush);
-}
-
-static void btree_update_updated_root(struct btree_update *as, struct btree *b)
-{
- struct bkey_i *insert = &b->key;
- struct bch_fs *c = as->c;
-
- BUG_ON(as->mode != BTREE_UPDATE_none);
-
- BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
- ARRAY_SIZE(as->journal_entries));
-
- as->journal_u64s +=
- journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
- BCH_JSET_ENTRY_btree_root,
- b->c.btree_id, b->c.level,
- insert, insert->k.u64s);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
- as->mode = BTREE_UPDATE_root;
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-/*
- * bch2_btree_update_add_new_node:
- *
- * This causes @as to wait on @b to be written, before it gets to
- * bch2_btree_update_nodes_written
- *
- * Additionally, it sets b->will_make_reachable to prevent any additional writes
- * to @b from happening besides the first until @b is reachable on disk
- *
- * And it adds @b to the list of @as's new nodes, so that we can update sector
- * counts in bch2_btree_update_nodes_written:
- */
-static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
-{
- struct bch_fs *c = as->c;
-
- closure_get(&as->cl);
-
- mutex_lock(&c->btree_interior_update_lock);
- BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
- BUG_ON(b->will_make_reachable);
-
- as->new_nodes[as->nr_new_nodes++] = b;
- b->will_make_reachable = 1UL|(unsigned long) as;
- set_btree_node_will_make_reachable(b);
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- btree_update_add_key(as, &as->new_keys, b);
-
- if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
- unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
- unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
-
- bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
- cpu_to_le16(sectors);
- }
-}
-
-/*
- * If @b was a new node (i.e. a btree_update was making it reachable), drop
- * it from that update's list of new nodes:
- */
-static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
-{
- struct btree_update *as;
- unsigned long v;
- unsigned i;
-
- mutex_lock(&c->btree_interior_update_lock);
- /*
- * When b->will_make_reachable != 0, it owns a ref on as->cl that's
- * dropped when it gets written by bch2_btree_complete_write - the
- * xchg() is for synchronization with bch2_btree_complete_write:
- */
- v = xchg(&b->will_make_reachable, 0);
- clear_btree_node_will_make_reachable(b);
- as = (struct btree_update *) (v & ~1UL);
-
- if (!as) {
- mutex_unlock(&c->btree_interior_update_lock);
- return;
- }
-
- for (i = 0; i < as->nr_new_nodes; i++)
- if (as->new_nodes[i] == b)
- goto found;
-
- BUG();
-found:
- array_remove_item(as->new_nodes, as->nr_new_nodes, i);
- mutex_unlock(&c->btree_interior_update_lock);
-
- if (v & 1)
- closure_put(&as->cl);
-}
-
-static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
-{
- while (b->ob.nr)
- as->open_buckets[as->nr_open_buckets++] =
- b->ob.v[--b->ob.nr];
-}
-
-static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- return 0;
-}
-
-/*
- * @b is being split/rewritten: it may have pointers to not-yet-written btree
- * nodes and thus outstanding btree_updates - redirect @b's
- * btree_updates to point to this btree_update:
- */
-static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
- struct btree *b)
-{
- struct bch_fs *c = as->c;
- struct btree_update *p, *n;
- struct btree_write *w;
-
- set_btree_node_dying(b);
-
- if (btree_node_fake(b))
- return;
-
- mutex_lock(&c->btree_interior_update_lock);
-
- /*
- * Does this node have any btree_update operations preventing
- * it from being written?
- *
- * If so, redirect them to point to this btree_update: we can
- * write out our new nodes, but we won't make them visible until those
- * operations complete
- */
- list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
- list_del_init(&p->write_blocked_list);
- btree_update_reparent(as, p);
-
- /*
- * for flush_held_btree_writes() waiting on updates to flush or
- * nodes to be writeable:
- */
- closure_wake_up(&c->btree_interior_update_wait);
- }
-
- clear_btree_node_dirty_acct(c, b);
- clear_btree_node_need_write(b);
- clear_btree_node_write_blocked(b);
-
- /*
- * Does this node have unwritten data that has a pin on the journal?
- *
- * If so, transfer that pin to the btree_update operation -
- * note that if we're freeing multiple nodes, we only need to keep the
- * oldest pin of any of the nodes we're freeing. We'll release the pin
- * when the new nodes are persistent and reachable on disk:
- */
- w = btree_current_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
- bch2_btree_update_will_free_node_journal_pin_flush);
- bch2_journal_pin_drop(&c->journal, &w->journal);
-
- w = btree_prev_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
- bch2_btree_update_will_free_node_journal_pin_flush);
- bch2_journal_pin_drop(&c->journal, &w->journal);
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * Is this a node that isn't reachable on disk yet?
- *
- * Nodes that aren't reachable yet have writes blocked until they're
- * reachable - now that we've cancelled any pending writes and moved
- * things waiting on that write to wait on this update, we can drop this
- * node from the list of nodes that the other update is making
- * reachable, prior to freeing it:
- */
- btree_update_drop_new_node(c, b);
-
- btree_update_add_key(as, &as->old_keys, b);
-
- as->old_nodes[as->nr_old_nodes] = b;
- as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
- as->nr_old_nodes++;
-}
-
-static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
-{
- struct bch_fs *c = as->c;
- u64 start_time = as->start_time;
-
- BUG_ON(as->mode == BTREE_UPDATE_none);
-
- if (as->took_gc_lock)
- up_read(&as->c->gc_lock);
- as->took_gc_lock = false;
-
- bch2_btree_reserve_put(as, trans);
-
- continue_at(&as->cl, btree_update_set_nodes_written,
- as->c->btree_interior_update_worker);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
- start_time);
-}
-
-static const char * const btree_node_reawrite_reason_strs[] = {
-#define x(n) #n,
- BTREE_NODE_REWRITE_REASON()
-#undef x
- NULL,
-};
-
-static struct btree_update *
-bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level_start, bool split,
- unsigned target, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_update *as;
- u64 start_time = local_clock();
- int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
- ? BCH_DISK_RESERVATION_NOFAIL : 0;
- unsigned nr_nodes[2] = { 0, 0 };
- unsigned level_end = level_start;
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- int ret = 0;
- u32 restart_count = trans->restart_count;
-
- BUG_ON(!path->should_be_locked);
-
- if (watermark == BCH_WATERMARK_copygc)
- watermark = BCH_WATERMARK_btree_copygc;
- if (watermark < BCH_WATERMARK_btree)
- watermark = BCH_WATERMARK_btree;
-
- flags &= ~BCH_WATERMARK_MASK;
- flags |= watermark;
-
- if (watermark < BCH_WATERMARK_reclaim &&
- test_bit(JOURNAL_space_low, &c->journal.flags)) {
- if (flags & BCH_TRANS_COMMIT_journal_reclaim)
- return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
-
- ret = drop_locks_do(trans,
- ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
- if (ret)
- return ERR_PTR(ret);
- }
-
- while (1) {
- nr_nodes[!!level_end] += 1 + split;
- level_end++;
-
- ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
- if (ret)
- return ERR_PTR(ret);
-
- if (!btree_path_node(path, level_end)) {
- /* Allocating new root? */
- nr_nodes[1] += split;
- level_end = BTREE_MAX_DEPTH;
- break;
- }
-
- /*
- * Always check for space for two keys, even if we won't have to
- * split at prior level - it might have been a merge instead:
- */
- if (bch2_btree_node_insert_fits(path->l[level_end].b,
- BKEY_BTREE_PTR_U64s_MAX * 2))
- break;
-
- split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
- }
-
- if (!down_read_trylock(&c->gc_lock)) {
- ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
- if (ret) {
- up_read(&c->gc_lock);
- return ERR_PTR(ret);
- }
- }
-
- as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
- memset(as, 0, sizeof(*as));
- closure_init(&as->cl, NULL);
- as->c = c;
- as->start_time = start_time;
- as->ip_started = _RET_IP_;
- as->mode = BTREE_UPDATE_none;
- as->flags = flags;
- as->took_gc_lock = true;
- as->btree_id = path->btree_id;
- as->update_level_start = level_start;
- as->update_level_end = level_end;
- INIT_LIST_HEAD(&as->list);
- INIT_LIST_HEAD(&as->unwritten_list);
- INIT_LIST_HEAD(&as->write_blocked_list);
- bch2_keylist_init(&as->old_keys, as->_old_keys);
- bch2_keylist_init(&as->new_keys, as->_new_keys);
- bch2_keylist_init(&as->parent_keys, as->inline_keys);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->list, &c->btree_interior_update_list);
- mutex_unlock(&c->btree_interior_update_lock);
-
- struct btree *b = btree_path_node(path, path->level);
- as->node_start = b->data->min_key;
- as->node_end = b->data->max_key;
- as->node_needed_rewrite = btree_node_rewrite_reason(b);
- as->node_written = b->written;
- as->node_sectors = btree_buf_bytes(b) >> 9;
- as->node_remaining = __bch2_btree_u64s_remaining(b,
- btree_bkey_last(b, bset_tree_last(b)));
-
- /*
- * We don't want to allocate if we're in an error state, that can cause
- * deadlock on emergency shutdown due to open buckets getting stuck in
- * the btree_reserve_cache after allocator shutdown has cleared it out.
- * This check needs to come after adding us to the btree_interior_update
- * list but before calling bch2_btree_reserve_get, to synchronize with
- * __bch2_fs_read_only().
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- goto err;
-
- ret = bch2_disk_reservation_get(c, &as->disk_res,
- (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
- READ_ONCE(c->opts.metadata_replicas),
- disk_res_flags);
- if (ret)
- goto err;
-
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL);
- if (bch2_err_matches(ret, ENOSPC) ||
- bch2_err_matches(ret, ENOMEM)) {
- struct closure cl;
-
- /*
- * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
- * flag
- */
- if (bch2_err_matches(ret, ENOSPC) &&
- (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < BCH_WATERMARK_reclaim) {
- ret = bch_err_throw(c, journal_reclaim_would_deadlock);
- goto err;
- }
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl);
- if (!bch2_err_matches(ret, BCH_ERR_operation_blocked))
- break;
- bch2_trans_unlock(trans);
- bch2_wait_on_allocator(c, &cl);
- } while (1);
- }
-
- if (ret) {
- trace_and_count(c, btree_reserve_get_fail, trans->fn,
- _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
- goto err;
- }
-
- ret = bch2_trans_relock(trans);
- if (ret)
- goto err;
-
- bch2_trans_verify_not_restarted(trans, restart_count);
- return as;
-err:
- bch2_btree_update_free(as, trans);
- if (!bch2_err_matches(ret, ENOSPC) &&
- !bch2_err_matches(ret, EROFS) &&
- ret != -BCH_ERR_journal_reclaim_would_deadlock &&
- ret != -BCH_ERR_journal_shutdown)
- bch_err_fn_ratelimited(c, ret);
- return ERR_PTR(ret);
-}
-
-/* Btree root updates: */
-
-static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
-{
- /* Root nodes cannot be reaped */
- mutex_lock(&c->btree_cache.lock);
- list_del_init(&b->list);
- mutex_unlock(&c->btree_cache.lock);
-
- mutex_lock(&c->btree_root_lock);
- bch2_btree_id_root(c, b->c.btree_id)->b = b;
- mutex_unlock(&c->btree_root_lock);
-
- bch2_recalc_btree_reserve(c);
-}
-
-static int bch2_btree_set_root(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- bool nofail)
-{
- struct bch_fs *c = as->c;
-
- trace_and_count(c, btree_node_set_root, trans, b);
-
- struct btree *old = btree_node_root(c, b);
-
- /*
- * Ensure no one is using the old root while we switch to the
- * new root:
- */
- if (nofail) {
- bch2_btree_node_lock_write_nofail(trans, path, &old->c);
- } else {
- int ret = bch2_btree_node_lock_write(trans, path, &old->c);
- if (ret)
- return ret;
- }
-
- bch2_btree_set_root_inmem(c, b);
-
- btree_update_updated_root(as, b);
-
- /*
- * Unlock old root after new root is visible:
- *
- * The new root isn't persistent, but that's ok: we still have
- * an intent lock on the new root, and any updates that would
- * depend on the new root would have to update the new root.
- */
- bch2_btree_node_unlock_write(trans, path, old);
- return 0;
-}
-
-/* Interior node updates: */
-
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_i *insert)
-{
- struct bch_fs *c = as->c;
- struct bkey_packed *k;
- struct printbuf buf = PRINTBUF;
- unsigned long old, new;
-
- BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
- !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));
-
- if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
- bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
-
- struct bkey_validate_context from = (struct bkey_validate_context) {
- .from = BKEY_VALIDATE_btree_node,
- .level = b->c.level,
- .btree = b->c.btree_id,
- .flags = BCH_VALIDATE_commit,
- };
- if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?:
- bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) {
- bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
- dump_stack();
- }
-
- BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
- ARRAY_SIZE(as->journal_entries));
-
- as->journal_u64s +=
- journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
- BCH_JSET_ENTRY_btree_keys,
- b->c.btree_id, b->c.level,
- insert, insert->k.u64s);
-
- while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
- bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
- bch2_btree_node_iter_advance(node_iter, b);
-
- bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
- set_btree_node_dirty_acct(c, b);
-
- old = READ_ONCE(b->flags);
- do {
- new = old;
-
- new &= ~BTREE_WRITE_TYPE_MASK;
- new |= BTREE_WRITE_interior;
- new |= 1 << BTREE_NODE_need_write;
- } while (!try_cmpxchg(&b->flags, &old, new));
-
- printbuf_exit(&buf);
-}
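
For illustration, the flag update at the end of the function above is the usual compare-and-swap retry loop; a minimal standalone C11 sketch (mark_need_write is a hypothetical name, not a kernel API):

#include <stdatomic.h>

/* Replace the write-type bits and set the "need write" bit, retrying if a
 * concurrent updater changed the flag word under us. */
static void mark_need_write(_Atomic unsigned long *flags,
			    unsigned long type_mask,
			    unsigned long type,
			    unsigned long need_write_bit)
{
	unsigned long old = atomic_load(flags), new;

	do {
		new = (old & ~type_mask) | type | need_write_bit;
	} while (!atomic_compare_exchange_weak(flags, &old, new));
}
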
-
-static int
-bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct keylist *keys)
-{
- struct bkey_i *insert = bch2_keylist_front(keys);
- struct bkey_packed *k;
-
- BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
-
- while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
- (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
- ;
-
- for (;
- insert != keys->top && bpos_le(insert->k.p, b->key.k.p);
- insert = bkey_next(insert))
- bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
-
- int ret = bch2_btree_node_check_topology(trans, b);
- if (ret) {
- struct printbuf buf = PRINTBUF;
-
- for (struct bkey_i *k = keys->keys;
- k != insert;
- k = bkey_next(k)) {
- bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
- prt_newline(&buf);
- }
-
- bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s",
- (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf);
- dump_stack();
- return ret;
- }
-
- memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
- keys->top_p -= insert->_data - keys->keys_p;
- return 0;
-}
-
-static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
-{
- if (insert_keys)
- for_each_keylist_key(insert_keys, k)
- if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos))
- return true;
- return false;
-}
-
-/*
- * Distribute @b's keys between the two new nodes: n[0] becomes the lower node,
- * n[1] the higher node.
- */
-static void __btree_split_node(struct btree_update *as,
- struct btree_trans *trans,
- struct btree *b,
- struct btree *n[2],
- struct keylist *insert_keys)
-{
- struct bkey_packed *k;
- struct bpos n1_pos = POS_MIN;
- struct btree_node_iter iter;
- struct bset *bsets[2];
- struct bkey_format_state format[2];
- struct bkey_packed *out[2];
- struct bkey uk;
- unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
- struct { unsigned nr_keys, val_u64s; } nr_keys[2];
- int i;
-
- memset(&nr_keys, 0, sizeof(nr_keys));
-
- for (i = 0; i < 2; i++) {
- BUG_ON(n[i]->nsets != 1);
-
- bsets[i] = btree_bset_first(n[i]);
- out[i] = bsets[i]->start;
-
- SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
- bch2_bkey_format_init(&format[i]);
- }
-
- u64s = 0;
- for_each_btree_node_key(b, k, &iter) {
- if (bkey_deleted(k))
- continue;
-
- uk = bkey_unpack_key(b, k);
-
- if (b->c.level &&
- u64s < n1_u64s &&
- u64s + k->u64s >= n1_u64s &&
- (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) ||
- key_deleted_in_insert(insert_keys, uk.p)))
- n1_u64s += k->u64s;
-
- i = u64s >= n1_u64s;
- u64s += k->u64s;
- if (!i)
- n1_pos = uk.p;
- bch2_bkey_format_add_key(&format[i], &uk);
-
- nr_keys[i].nr_keys++;
- nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
- }
-
- btree_set_min(n[0], b->data->min_key);
- btree_set_max(n[0], n1_pos);
- btree_set_min(n[1], bpos_successor(n1_pos));
- btree_set_max(n[1], b->data->max_key);
-
- for (i = 0; i < 2; i++) {
- bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
- bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
-
- n[i]->data->format = bch2_bkey_format_done(&format[i]);
-
- unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
- nr_keys[i].val_u64s;
- if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
- n[i]->data->format = b->format;
-
- btree_node_set_format(n[i], n[i]->data->format);
- }
-
- u64s = 0;
- for_each_btree_node_key(b, k, &iter) {
- if (bkey_deleted(k))
- continue;
-
- i = u64s >= n1_u64s;
- u64s += k->u64s;
-
- if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
- ? &b->format: &bch2_bkey_format_current, k))
- out[i]->format = KEY_FORMAT_LOCAL_BTREE;
- else
- bch2_bkey_unpack(b, (void *) out[i], k);
-
- out[i]->needs_whiteout = false;
-
- btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
- out[i] = bkey_p_next(out[i]);
- }
-
- for (i = 0; i < 2; i++) {
- bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
-
- BUG_ON(!bsets[i]->u64s);
-
- set_btree_bset_end(n[i], n[i]->set);
-
- btree_node_reset_sib_u64s(n[i]);
-
- bch2_verify_btree_nr_keys(n[i]);
-
- BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
- }
-}
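
A worked example of the pivot choice in __btree_split_node above, with made-up numbers:

/*
 * With 1000 live u64s in @b, keys are appended to n[0] until the running
 * total reaches n1_u64s = 1000 * 3 / 5 = 600; everything after that goes to
 * n[1], and the last key placed in n[0] becomes the pivot (n[0]'s max key).
 * For interior nodes, the threshold is nudged past a key that is about to be
 * deleted (in the journal or in @insert_keys) so that such a key doesn't end
 * up as the pivot.
 */
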
-
-/*
- * For updates to interior nodes, we've got to do the insert before we split
- * because the stuff we're inserting has to be inserted atomically. Post split,
- * the keys might have to go in different nodes and the split would no longer be
- * atomic.
- *
- * Worse, if the insert is from btree node coalescing and we do the insert
- * after the split (and pick the pivot), the pivot we pick might be between
- * nodes that were coalesced, and thus in the middle of a child node post
- * coalescing:
- */
-static int btree_split_insert_keys(struct btree_update *as,
- struct btree_trans *trans,
- btree_path_idx_t path_idx,
- struct btree *b,
- struct keylist *keys)
-{
- struct btree_path *path = trans->paths + path_idx;
-
- if (!bch2_keylist_empty(keys) &&
- bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
- struct btree_node_iter node_iter;
-
- bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
-
- int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
- if (ret)
- return ret;
- }
-
- return 0;
-}
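
A small worked illustration of the ordering argument in the comment above btree_split_insert_keys, with made-up positions:

/*
 * Suppose keys at positions 10 and 90 must go into a node that will be split
 * around pivot 50. Inserting first and then splitting is one atomic change to
 * @b; splitting first would leave the two keys destined for different new
 * nodes, so the insert could no longer be done atomically - and if the keys
 * came from coalescing, a pivot picked before the insert could land in the
 * middle of a coalesced child's range.
 */
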
-
-static int btree_split(struct btree_update *as, struct btree_trans *trans,
- btree_path_idx_t path, struct btree *b,
- struct keylist *keys)
-{
- struct bch_fs *c = as->c;
- struct btree *parent = btree_node_parent(trans->paths + path, b);
- struct btree *n1, *n2 = NULL, *n3 = NULL;
- btree_path_idx_t path1 = 0, path2 = 0;
- u64 start_time = local_clock();
- int ret = 0;
-
- bch2_verify_btree_nr_keys(b);
- BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
-
- ret = bch2_btree_node_check_topology(trans, b);
- if (ret)
- return ret;
-
- if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
- struct btree *n[2];
-
- trace_and_count(c, btree_node_split, trans, b);
-
- n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
- n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
-
- __btree_split_node(as, trans, b, n, keys);
-
- if (keys) {
- ret = btree_split_insert_keys(as, trans, path, n1, keys) ?:
- btree_split_insert_keys(as, trans, path, n2, keys);
- if (ret)
- goto err;
- BUG_ON(!bch2_keylist_empty(keys));
- }
-
- bch2_btree_build_aux_trees(n2);
- bch2_btree_build_aux_trees(n1);
-
- bch2_btree_update_add_new_node(as, n1);
- bch2_btree_update_add_new_node(as, n2);
- six_unlock_write(&n2->c.lock);
- six_unlock_write(&n1->c.lock);
-
- path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
- six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
- path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
- six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path2, n2);
-
- /*
- * Note that on recursive splits, parent_keys == keys, so we
- * can't start adding new keys to parent_keys before emptying it
- * out (which we did with btree_split_insert_keys() above)
- */
- bch2_keylist_add(&as->parent_keys, &n1->key);
- bch2_keylist_add(&as->parent_keys, &n2->key);
-
- if (!parent) {
- /* Depth increases, make a new root */
- n3 = __btree_root_alloc(as, trans, b->c.level + 1);
-
- bch2_btree_update_add_new_node(as, n3);
- six_unlock_write(&n3->c.lock);
-
- trans->paths[path2].locks_want++;
- BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
- six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path2, n3);
-
- n3->sib_u64s[0] = U16_MAX;
- n3->sib_u64s[1] = U16_MAX;
-
- ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
- if (ret)
- goto err;
- }
- } else {
- trace_and_count(c, btree_node_compact, trans, b);
-
- n1 = bch2_btree_node_alloc_replacement(as, trans, b);
-
- if (keys) {
- ret = btree_split_insert_keys(as, trans, path, n1, keys);
- if (ret)
- goto err;
- BUG_ON(!bch2_keylist_empty(keys));
- }
-
- bch2_btree_build_aux_trees(n1);
- bch2_btree_update_add_new_node(as, n1);
- six_unlock_write(&n1->c.lock);
-
- path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
- six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + path1, n1);
-
- if (parent)
- bch2_keylist_add(&as->parent_keys, &n1->key);
- }
-
- /* New nodes all written, now make them visible: */
-
- if (parent) {
- /* Split a non root node */
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
- } else if (n3) {
- ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
- } else {
- /* Root filled up but didn't need to be split */
- ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
- }
-
- if (ret)
- goto err;
-
- bch2_btree_interior_update_will_free_node(as, b);
-
- if (n3) {
- bch2_btree_update_get_open_buckets(as, n3);
- bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0);
- }
- if (n2) {
- bch2_btree_update_get_open_buckets(as, n2);
- bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0);
- }
- bch2_btree_update_get_open_buckets(as, n1);
- bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0);
-
- /*
- * The old node must be freed (in memory) _before_ unlocking the new
- * nodes - else another thread could re-acquire a read lock on the old
- * node after another thread has locked and updated the new node, thus
- * seeing stale data:
- */
- bch2_btree_node_free_inmem(trans, trans->paths + path, b);
-
- if (n3)
- bch2_trans_node_add(trans, trans->paths + path, n3);
- if (n2)
- bch2_trans_node_add(trans, trans->paths + path2, n2);
- bch2_trans_node_add(trans, trans->paths + path1, n1);
-
- if (n3)
- six_unlock_intent(&n3->c.lock);
- if (n2)
- six_unlock_intent(&n2->c.lock);
- six_unlock_intent(&n1->c.lock);
-out:
- if (path2) {
- __bch2_btree_path_unlock(trans, trans->paths + path2);
- bch2_path_put(trans, path2, true);
- }
- if (path1) {
- __bch2_btree_path_unlock(trans, trans->paths + path1);
- bch2_path_put(trans, path1, true);
- }
-
- bch2_trans_verify_locks(trans);
-
- bch2_time_stats_update(&c->times[n2
- ? BCH_TIME_btree_node_split
- : BCH_TIME_btree_node_compact],
- start_time);
- return ret;
-err:
- if (n3)
- bch2_btree_node_free_never_used(as, trans, n3);
- if (n2)
- bch2_btree_node_free_never_used(as, trans, n2);
- bch2_btree_node_free_never_used(as, trans, n1);
- goto out;
-}
-
-/**
- * bch2_btree_insert_node - insert bkeys into a given btree node
- *
- * @as: btree_update object
- * @trans: btree_trans object
- * @path_idx: path that points to current node
- * @b: node to insert keys into
- * @keys: list of keys to insert
- *
- * Returns: 0 on success, typically transaction restart error on failure
- *
- * Inserts as many keys as it can into a given btree node, splitting it if full.
- * If a split occurred, this function will return early. This can only happen
- * for leaf nodes -- inserts into interior nodes have to be atomic.
- */
-static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
- btree_path_idx_t path_idx, struct btree *b,
- struct keylist *keys)
-{
- struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx, *linked;
- unsigned i;
- int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
- int old_live_u64s = b->nr.live_u64s;
- int live_u64s_added, u64s_added;
- int ret;
-
- lockdep_assert_held(&c->gc_lock);
- BUG_ON(!b->c.level);
- BUG_ON(!as || as->b);
- bch2_verify_keylist_sorted(keys);
-
- if (!btree_node_intent_locked(path, b->c.level)) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "%s(): node not locked at level %u\n",
- __func__, b->c.level);
- bch2_btree_update_to_text(&buf, as);
- bch2_btree_path_to_text(&buf, trans, path_idx);
- bch2_fs_emergency_read_only2(c, &buf);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- return -EIO;
- }
-
- ret = bch2_btree_node_lock_write(trans, path, &b->c);
- if (ret)
- return ret;
-
- bch2_btree_node_prep_for_write(trans, path, b);
-
- if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
- bch2_btree_node_unlock_write(trans, path, b);
- goto split;
- }
-
- ret = bch2_btree_node_check_topology(trans, b) ?:
- bch2_btree_insert_keys_interior(as, trans, path, b,
- path->l[b->c.level].iter, keys);
- if (ret) {
- bch2_btree_node_unlock_write(trans, path, b);
- return ret;
- }
-
- trans_for_each_path_with_node(trans, b, linked, i)
- bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
- bch2_trans_verify_paths(trans);
-
- live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
-
- if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
- if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
- if (u64s_added > live_u64s_added &&
- bch2_maybe_compact_whiteouts(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-
- btree_update_updated_node(as, b);
- bch2_btree_node_unlock_write(trans, path, b);
- return 0;
-split:
- /*
- * We could attempt to avoid the transaction restart, by calling
- * bch2_btree_path_upgrade() and allocating more nodes:
- */
- if (b->c.level >= as->update_level_end) {
- trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
- }
-
- return btree_split(as, trans, path_idx, b, keys);
-}
-
-int bch2_btree_split_leaf(struct btree_trans *trans,
- btree_path_idx_t path,
- unsigned flags)
-{
- /* btree_split & merge may both cause paths array to be reallocated */
- struct btree *b = path_l(trans->paths + path)->b;
- struct btree_update *as;
- unsigned l;
- int ret = 0;
-
- as = bch2_btree_update_start(trans, trans->paths + path,
- trans->paths[path].level,
- true, 0, flags);
- if (IS_ERR(as))
- return PTR_ERR(as);
-
- ret = btree_split(as, trans, path, b, NULL);
- if (ret) {
- bch2_btree_update_free(as, trans);
- return ret;
- }
-
- bch2_btree_update_done(as, trans);
-
- for (l = trans->paths[path].level + 1;
- btree_node_intent_locked(&trans->paths[path], l) && !ret;
- l++)
- ret = bch2_foreground_maybe_merge(trans, path, l, flags);
-
- return ret;
-}
-
-static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
- btree_path_idx_t path_idx)
-{
- struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx;
- struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
-
- BUG_ON(!btree_node_locked(path, b->c.level));
-
- n = __btree_root_alloc(as, trans, b->c.level + 1);
-
- bch2_btree_update_add_new_node(as, n);
- six_unlock_write(&n->c.lock);
-
- path->locks_want++;
- BUG_ON(btree_node_locked(path, n->c.level));
- six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path, n);
-
- n->sib_u64s[0] = U16_MAX;
- n->sib_u64s[1] = U16_MAX;
-
- bch2_keylist_add(&as->parent_keys, &b->key);
- btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
-
- int ret = bch2_btree_set_root(as, trans, path, n, true);
- BUG_ON(ret);
-
- bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
- bch2_trans_node_add(trans, path, n);
- six_unlock_intent(&n->c.lock);
-
- mutex_lock(&c->btree_cache.lock);
- list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
- mutex_unlock(&c->btree_cache.lock);
-
- bch2_trans_verify_locks(trans);
-}
-
-int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
-
- if (btree_node_fake(b))
- return bch2_btree_split_leaf(trans, path, flags);
-
- struct btree_update *as =
- bch2_btree_update_start(trans, trans->paths + path, b->c.level,
- true, 0, flags);
- if (IS_ERR(as))
- return PTR_ERR(as);
-
- __btree_increase_depth(as, trans, path);
- bch2_btree_update_done(as, trans);
- return 0;
-}
-
-int __bch2_foreground_maybe_merge(struct btree_trans *trans,
- btree_path_idx_t path,
- unsigned level,
- unsigned flags,
- enum btree_node_sibling sib)
-{
- struct bch_fs *c = trans->c;
- struct btree_update *as;
- struct bkey_format_state new_s;
- struct bkey_format new_f;
- struct bkey_i delete;
- struct btree *b, *m, *n, *prev, *next, *parent;
- struct bpos sib_pos;
- size_t sib_u64s;
- enum btree_id btree = trans->paths[path].btree_id;
- btree_path_idx_t sib_path = 0, new_path = 0;
- u64 start_time = local_clock();
- int ret = 0;
-
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
- BUG_ON(!trans->paths[path].should_be_locked);
- BUG_ON(!btree_node_locked(&trans->paths[path], level));
-
- /*
- * Work around a deadlock caused by the btree write buffer not doing
- * merges and leaving tons of merges for us to do - we really don't need
- * to be doing merges at all from the interior update path, and if the
- * interior update path is generating too many new interior updates we
- * deadlock:
- */
- if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
- return 0;
-
- if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
- flags &= ~BCH_WATERMARK_MASK;
- flags |= BCH_WATERMARK_btree;
- flags |= BCH_TRANS_COMMIT_journal_reclaim;
- }
-
- b = trans->paths[path].l[level].b;
-
- if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
- (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
- b->sib_u64s[sib] = U16_MAX;
- return 0;
- }
-
- sib_pos = sib == btree_prev_sib
- ? bpos_predecessor(b->data->min_key)
- : bpos_successor(b->data->max_key);
-
- sib_path = bch2_path_get(trans, btree, sib_pos,
- U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, sib_path, false);
- if (ret)
- goto err;
-
- btree_path_set_should_be_locked(trans, trans->paths + sib_path);
-
- m = trans->paths[sib_path].l[level].b;
-
- if (btree_node_parent(trans->paths + path, b) !=
- btree_node_parent(trans->paths + sib_path, m)) {
- b->sib_u64s[sib] = U16_MAX;
- goto out;
- }
-
- if (sib == btree_prev_sib) {
- prev = m;
- next = b;
- } else {
- prev = b;
- next = m;
- }
-
- if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
- struct printbuf buf = PRINTBUF;
-
- printbuf_indent_add_nextline(&buf, 2);
- prt_printf(&buf, "%s(): ", __func__);
- ret = __bch2_topology_error(c, &buf);
- prt_newline(&buf);
-
- prt_printf(&buf, "prev ends at ");
- bch2_bpos_to_text(&buf, prev->data->max_key);
- prt_newline(&buf);
-
- prt_printf(&buf, "next starts at ");
- bch2_bpos_to_text(&buf, next->data->min_key);
-
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- bch2_bkey_format_init(&new_s);
- bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
- __bch2_btree_calc_format(&new_s, prev);
- __bch2_btree_calc_format(&new_s, next);
- bch2_bkey_format_add_pos(&new_s, next->data->max_key);
- new_f = bch2_bkey_format_done(&new_s);
-
- sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
- btree_node_u64s_with_format(m->nr, &m->format, &new_f);
-
- if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
- sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
- sib_u64s /= 2;
- sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
- }
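/*
 * Illustrative arithmetic (made-up numbers): with a hysteresis point of 1000
 * u64s and a combined size of 1800, the value recorded is
 * 1000 + (1800 - 1000) / 2 = 1400 - halving the excess over the hysteresis
 * point pulls the estimate back toward it, damping merge decisions for
 * borderline siblings.
 */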
-
- sib_u64s = min(sib_u64s, btree_max_u64s(c));
- sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
- b->sib_u64s[sib] = sib_u64s;
-
- if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
- goto out;
-
- parent = btree_node_parent(trans->paths + path, b);
- as = bch2_btree_update_start(trans, trans->paths + path, level, false,
- 0, BCH_TRANS_COMMIT_no_enospc|flags);
- ret = PTR_ERR_OR_ZERO(as);
- if (ret)
- goto err;
-
- as->node_start = prev->data->min_key;
- as->node_end = next->data->max_key;
-
- trace_and_count(c, btree_node_merge, trans, b);
-
- n = bch2_btree_node_alloc(as, trans, b->c.level);
-
- SET_BTREE_NODE_SEQ(n->data,
- max(BTREE_NODE_SEQ(b->data),
- BTREE_NODE_SEQ(m->data)) + 1);
-
- btree_set_min(n, prev->data->min_key);
- btree_set_max(n, next->data->max_key);
-
- n->data->format = new_f;
- btree_node_set_format(n, new_f);
-
- bch2_btree_sort_into(c, n, prev);
- bch2_btree_sort_into(c, n, next);
-
- bch2_btree_build_aux_trees(n);
- bch2_btree_update_add_new_node(as, n);
- six_unlock_write(&n->c.lock);
-
- new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
- six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
- bkey_init(&delete.k);
- delete.k.p = prev->key.k.p;
- bch2_keylist_add(&as->parent_keys, &delete);
- bch2_keylist_add(&as->parent_keys, &n->key);
-
- bch2_trans_verify_paths(trans);
-
- ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
- if (ret)
- goto err_free_update;
-
- bch2_btree_interior_update_will_free_node(as, b);
- bch2_btree_interior_update_will_free_node(as, m);
-
- bch2_trans_verify_paths(trans);
-
- bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-
- bch2_btree_node_free_inmem(trans, trans->paths + path, b);
- bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
-
- bch2_trans_node_add(trans, trans->paths + path, n);
-
- bch2_trans_verify_paths(trans);
-
- six_unlock_intent(&n->c.lock);
-
- bch2_btree_update_done(as, trans);
-
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
-out:
-err:
- if (new_path)
- bch2_path_put(trans, new_path, true);
- bch2_path_put(trans, sib_path, true);
- bch2_trans_verify_locks(trans);
- if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
- ret = 0;
- if (!ret)
- ret = bch2_trans_relock(trans);
- return ret;
-err_free_update:
- bch2_btree_node_free_never_used(as, trans, n);
- bch2_btree_update_free(as, trans);
- goto out;
-}
-
-static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
- struct btree *b)
-{
- bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
- BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(trans, iter);
- if (ret)
- goto err;
-
- /* has the node been freed? */
- if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
- BUG_ON(!btree_node_dying(b));
- ret = bch_err_throw(trans->c, btree_node_dying);
- goto err;
- }
-
- BUG_ON(!btree_node_hashed(b));
- return 0;
-err:
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree *b,
- unsigned target,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree *n, *parent;
- struct btree_update *as;
- btree_path_idx_t new_path = 0;
- int ret;
-
- flags |= BCH_TRANS_COMMIT_no_enospc;
-
- struct btree_path *path = btree_iter_path(trans, iter);
- parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, b->c.level,
- false, target, flags);
- ret = PTR_ERR_OR_ZERO(as);
- if (ret)
- goto out;
-
- n = bch2_btree_node_alloc_replacement(as, trans, b);
-
- bch2_btree_build_aux_trees(n);
- bch2_btree_update_add_new_node(as, n);
- six_unlock_write(&n->c.lock);
-
- new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
- six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, trans->paths + new_path, n);
-
- trace_and_count(c, btree_node_rewrite, trans, b);
-
- if (parent) {
- bch2_keylist_add(&as->parent_keys, &n->key);
- ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
- } else {
- ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
- }
-
- if (ret)
- goto err;
-
- bch2_btree_interior_update_will_free_node(as, b);
-
- bch2_btree_update_get_open_buckets(as, n);
- bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
-
- bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
-
- bch2_trans_node_add(trans, trans->paths + iter->path, n);
- six_unlock_intent(&n->c.lock);
-
- bch2_btree_update_done(as, trans);
-out:
- if (new_path)
- bch2_path_put(trans, new_path, true);
- bch2_trans_downgrade(trans);
- return ret;
-err:
- bch2_btree_node_free_never_used(as, trans, n);
- bch2_btree_update_free(as, trans);
- goto out;
-}
-
-int bch2_btree_node_rewrite_key(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_i *k, unsigned flags)
-{
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter,
- btree, k->k.p,
- BTREE_MAX_DEPTH, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto out;
-
- bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
- ret = found
- ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags)
- : -ENOENT;
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bpos pos,
- unsigned target,
- unsigned flags)
-{
- BUG_ON(!level);
-
- /* Traverse one depth lower to get a pointer to the node itself: */
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto err;
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
- struct btree *b, unsigned flags)
-{
- struct btree_iter iter;
- int ret = get_iter_to_node(trans, &iter, b);
- if (ret)
- return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-struct async_btree_rewrite {
- struct bch_fs *c;
- struct work_struct work;
- struct list_head list;
- enum btree_id btree_id;
- unsigned level;
- struct bkey_buf key;
-};
-
-static void async_btree_node_rewrite_work(struct work_struct *work)
-{
- struct async_btree_rewrite *a =
- container_of(work, struct async_btree_rewrite, work);
- struct bch_fs *c = a->c;
-
- int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
- a->btree_id, a->level, a->key.k, 0));
- if (!bch2_err_matches(ret, ENOENT) &&
- !bch2_err_matches(ret, EROFS))
- bch_err_fn_ratelimited(c, ret);
-
- spin_lock(&c->btree_node_rewrites_lock);
- list_del(&a->list);
- spin_unlock(&c->btree_node_rewrites_lock);
-
- closure_wake_up(&c->btree_node_rewrites_wait);
-
- bch2_bkey_buf_exit(&a->key, c);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite);
- kfree(a);
-}
-
-void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
-{
- struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
- if (!a)
- return;
-
- a->c = c;
- a->btree_id = b->c.btree_id;
- a->level = b->c.level;
- INIT_WORK(&a->work, async_btree_node_rewrite_work);
-
- bch2_bkey_buf_init(&a->key);
- bch2_bkey_buf_copy(&a->key, c, &b->key);
-
- bool now = false, pending = false;
-
- spin_lock(&c->btree_node_rewrites_lock);
- if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) &&
- enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) {
- list_add(&a->list, &c->btree_node_rewrites);
- now = true;
- } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
- list_add(&a->list, &c->btree_node_rewrites_pending);
- pending = true;
- }
- spin_unlock(&c->btree_node_rewrites_lock);
-
- if (now) {
- queue_work(c->btree_node_rewrite_worker, &a->work);
- } else if (pending) {
- /* bch2_do_pending_node_rewrites will execute */
- } else {
- bch2_bkey_buf_exit(&a->key, c);
- kfree(a);
- }
-}
-
-void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
-{
- closure_wait_event(&c->btree_node_rewrites_wait,
- list_empty(&c->btree_node_rewrites));
-}
-
-void bch2_do_pending_node_rewrites(struct bch_fs *c)
-{
- while (1) {
- spin_lock(&c->btree_node_rewrites_lock);
- struct async_btree_rewrite *a =
- list_pop_entry(&c->btree_node_rewrites_pending,
- struct async_btree_rewrite, list);
- if (a)
- list_add(&a->list, &c->btree_node_rewrites);
- spin_unlock(&c->btree_node_rewrites_lock);
-
- if (!a)
- break;
-
- enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite);
- queue_work(c->btree_node_rewrite_worker, &a->work);
- }
-}
-
-void bch2_free_pending_node_rewrites(struct bch_fs *c)
-{
- while (1) {
- spin_lock(&c->btree_node_rewrites_lock);
- struct async_btree_rewrite *a =
- list_pop_entry(&c->btree_node_rewrites_pending,
- struct async_btree_rewrite, list);
- spin_unlock(&c->btree_node_rewrites_lock);
-
- if (!a)
- break;
-
- bch2_bkey_buf_exit(&a->key, c);
- kfree(a);
- }
-}
-
-static int __bch2_btree_node_update_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree *b, struct btree *new_hash,
- struct bkey_i *new_key,
- unsigned commit_flags,
- bool skip_triggers)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter2 = {};
- struct btree *parent;
- int ret;
-
- if (!skip_triggers) {
- ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
- bkey_i_to_s_c(&b->key),
- BTREE_TRIGGER_transactional) ?:
- bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
- bkey_i_to_s(new_key),
- BTREE_TRIGGER_transactional);
- if (ret)
- return ret;
- }
-
- if (new_hash) {
- bkey_copy(&new_hash->key, new_key);
- ret = bch2_btree_node_hash_insert(&c->btree_cache,
- new_hash, b->c.level, b->c.btree_id);
- BUG_ON(ret);
- }
-
- parent = btree_node_parent(btree_iter_path(trans, iter), b);
- if (parent) {
- bch2_trans_copy_iter(trans, &iter2, iter);
-
- iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
- iter2.flags & BTREE_ITER_intent,
- _THIS_IP_);
-
- struct btree_path *path2 = btree_iter_path(trans, &iter2);
- BUG_ON(path2->level != b->c.level);
- BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
-
- btree_path_set_level_up(trans, path2);
-
- trans->paths_sorted = false;
-
- ret = bch2_btree_iter_traverse(trans, &iter2) ?:
- bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
- if (ret)
- goto err;
- } else {
- BUG_ON(btree_node_root(c, b) != b);
-
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
- jset_u64s(new_key->k.u64s));
- ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- return ret;
-
- journal_entry_set(e,
- BCH_JSET_ENTRY_btree_root,
- b->c.btree_id, b->c.level,
- new_key, new_key->k.u64s);
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
- if (ret)
- goto err;
-
- bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
-
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
-
- __bch2_btree_node_hash_remove(&c->btree_cache, b);
-
- bkey_copy(&b->key, new_key);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
- mutex_unlock(&c->btree_cache.lock);
- } else {
- bkey_copy(&b->key, new_key);
- }
-
- bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
-out:
- bch2_trans_iter_exit(trans, &iter2);
- return ret;
-err:
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
- }
- goto out;
-}
-
-int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
- struct btree *b, struct bkey_i *new_key,
- unsigned commit_flags, bool skip_triggers)
-{
- struct bch_fs *c = trans->c;
- struct btree *new_hash = NULL;
- struct btree_path *path = btree_iter_path(trans, iter);
- struct closure cl;
- int ret = 0;
-
- ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
- if (ret)
- return ret;
-
- closure_init_stack(&cl);
-
- /*
- * check btree_ptr_hash_val() after @b is locked by
- * btree_iter_traverse():
- */
- if (btree_ptr_hash_val(new_key) != b->hash_val) {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- if (ret) {
- ret = drop_locks_do(trans, (closure_sync(&cl), 0));
- if (ret)
- return ret;
- }
-
- new_hash = bch2_btree_node_mem_alloc(trans, false);
- ret = PTR_ERR_OR_ZERO(new_hash);
- if (ret)
- goto err;
- }
-
- path->intent_ref++;
- ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
- commit_flags, skip_triggers);
- --path->intent_ref;
-
- if (new_hash)
- bch2_btree_node_to_freelist(c, new_hash);
-err:
- closure_sync(&cl);
- bch2_btree_cache_cannibalize_unlock(trans);
- return ret;
-}
-
-int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
- struct btree *b, struct bkey_i *new_key,
- unsigned commit_flags, bool skip_triggers)
-{
- struct btree_iter iter;
- int ret = get_iter_to_node(trans, &iter, b);
- if (ret)
- return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
-
- bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
- !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
-
- ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
- commit_flags, skip_triggers);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* Init code: */
-
-/*
- * Only for filesystem bringup, when first reading the btree roots or allocating
- * btree roots when initializing a new filesystem:
- */
-void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
-{
- BUG_ON(btree_node_root(c, b));
-
- bch2_btree_set_root_inmem(c, b);
-}
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
-{
- struct bch_fs *c = trans->c;
- struct closure cl;
- struct btree *b;
- int ret;
-
- closure_init_stack(&cl);
-
- do {
- ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
- closure_sync(&cl);
- } while (ret);
-
- b = bch2_btree_node_mem_alloc(trans, false);
- bch2_btree_cache_cannibalize_unlock(trans);
-
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- return ret;
-
- set_btree_node_fake(b);
- set_btree_node_need_rewrite(b);
- b->c.level = level;
- b->c.btree_id = id;
-
- bkey_btree_ptr_init(&b->key);
- b->key.k.p = SPOS_MAX;
- *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
-
- bch2_bset_init_first(b, &b->data->keys);
- bch2_btree_build_aux_trees(b);
-
- b->data->flags = 0;
- btree_set_min(b, POS_MIN);
- btree_set_max(b, SPOS_MAX);
- b->data->format = bch2_btree_calc_format(b);
- btree_node_set_format(b, b->data->format);
-
- ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
- b->c.level, b->c.btree_id);
- BUG_ON(ret);
-
- bch2_btree_set_root_inmem(c, b);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- return 0;
-}
-
-void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
-{
- bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
-}
-
-static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
-{
- prt_printf(out, "%ps: ", (void *) as->ip_started);
- bch2_trans_commit_flags_to_text(out, as->flags);
-
- prt_str(out, " ");
- bch2_btree_id_to_text(out, as->btree_id);
- prt_printf(out, " l=%u-%u ",
- as->update_level_start,
- as->update_level_end);
- bch2_bpos_to_text(out, as->node_start);
- prt_char(out, ' ');
- bch2_bpos_to_text(out, as->node_end);
- prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s",
- as->node_written,
- as->node_sectors,
- as->node_remaining,
- btree_node_reawrite_reason_strs[as->node_needed_rewrite]);
-
- prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
- bch2_btree_update_modes[as->mode],
- as->nodes_written,
- closure_nr_remaining(&as->cl),
- as->journal.seq);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct btree_update *as;
-
- mutex_lock(&c->btree_interior_update_lock);
- list_for_each_entry(as, &c->btree_interior_update_list, list)
- bch2_btree_update_to_text(out, as);
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
-{
- bool ret;
-
- mutex_lock(&c->btree_interior_update_lock);
- ret = !list_empty(&c->btree_interior_update_list);
- mutex_unlock(&c->btree_interior_update_lock);
-
- return ret;
-}
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *c)
-{
- bool ret = bch2_btree_interior_updates_pending(c);
-
- if (ret)
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_pending(c));
- return ret;
-}
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
-{
- struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
-
- mutex_lock(&c->btree_root_lock);
-
- r->level = entry->level;
- r->alive = true;
- bkey_copy(&r->key, (struct bkey_i *) entry->start);
-
- mutex_unlock(&c->btree_root_lock);
-}
-
-struct jset_entry *
-bch2_btree_roots_to_journal_entries(struct bch_fs *c,
- struct jset_entry *end,
- unsigned long skip)
-{
- unsigned i;
-
- mutex_lock(&c->btree_root_lock);
-
- for (i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (r->alive && !test_bit(i, &skip)) {
- journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
- i, r->level, &r->key, r->key.k.u64s);
- end = vstruct_next(end);
- }
- }
-
- mutex_unlock(&c->btree_root_lock);
-
- return end;
-}
-
-static void bch2_btree_alloc_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct btree_alloc *a)
-{
- printbuf_indent_add(out, 2);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
- prt_newline(out);
-
- struct open_bucket *ob;
- unsigned i;
- open_bucket_for_each(c, &a->ob, ob, i)
- bch2_open_bucket_to_text(out, c, ob);
-
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
-{
- for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
- bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
-}
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
-{
- WARN_ON(!list_empty(&c->btree_node_rewrites));
- WARN_ON(!list_empty(&c->btree_node_rewrites_pending));
-
- if (c->btree_node_rewrite_worker)
- destroy_workqueue(c->btree_node_rewrite_worker);
- if (c->btree_interior_update_worker)
- destroy_workqueue(c->btree_interior_update_worker);
- mempool_exit(&c->btree_interior_update_pool);
-}
-
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
-{
- mutex_init(&c->btree_reserve_cache_lock);
- INIT_LIST_HEAD(&c->btree_interior_update_list);
- INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
- mutex_init(&c->btree_interior_update_lock);
- INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
-
- INIT_LIST_HEAD(&c->btree_node_rewrites);
- INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
- spin_lock_init(&c->btree_node_rewrites_lock);
-}
-
-int bch2_fs_btree_interior_update_init(struct bch_fs *c)
-{
- c->btree_interior_update_worker =
- alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
- if (!c->btree_interior_update_worker)
- return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
-
- c->btree_node_rewrite_worker =
- alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
- if (!c->btree_node_rewrite_worker)
- return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
-
- if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
- sizeof(struct btree_update)))
- return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init);
-
- return 0;
-}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
deleted file mode 100644
index ac04e45a8515..000000000000
--- a/fs/bcachefs/btree_update_interior.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
-
-#include "btree_cache.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-
-#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
-
-#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
-
-int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
-
-#define BTREE_UPDATE_MODES() \
- x(none) \
- x(node) \
- x(root) \
- x(update)
-
-enum btree_update_mode {
-#define x(n) BTREE_UPDATE_##n,
- BTREE_UPDATE_MODES()
-#undef x
-};
-
-/*
- * Tracks an in progress split/rewrite of a btree node and the update to the
- * parent node:
- *
- * When we split/rewrite a node, we do all the updates in memory without
- * waiting for any writes to complete - we allocate the new node(s) and update
- * the parent node, possibly recursively up to the root.
- *
- * The end result is that we have one or more new nodes being written -
- * possibly several, if there were multiple splits - and then a write (updating
- * an interior node) which will make all these new nodes visible.
- *
- * Additionally, as we split/rewrite nodes we free the old nodes - but the old
- * nodes can't be freed (their space on disk can't be reclaimed) until the
- * update to the interior node that makes the new node visible completes -
- * until then, the old nodes are still reachable on disk.
- */
-struct btree_update {
- struct closure cl;
- struct bch_fs *c;
- u64 start_time;
- unsigned long ip_started;
-
- struct list_head list;
- struct list_head unwritten_list;
-
- enum btree_update_mode mode;
- enum bch_trans_commit_flags flags;
- unsigned nodes_written:1;
- unsigned took_gc_lock:1;
-
- enum btree_id btree_id;
- struct bpos node_start;
- struct bpos node_end;
- enum btree_node_rewrite_reason node_needed_rewrite;
- u16 node_written;
- u16 node_sectors;
- u16 node_remaining;
-
- unsigned update_level_start;
- unsigned update_level_end;
-
- struct disk_reservation disk_res;
-
- /*
- * BTREE_UPDATE_node:
- * The update that made the new nodes visible was a regular update to an
- * existing interior node - @b. We can't write out the update to @b
- * until the new nodes we created are finished writing, so we block @b
- * from writing by putting this btree_interior update on the
- * @b->write_blocked list with @write_blocked_list:
- */
- struct btree *b;
- struct list_head write_blocked_list;
-
- /*
- * We may be freeing nodes that were dirty, and thus had journal entries
- * pinned: we need to transfer the oldest of those pins to the
- * btree_update operation, and release it when the new node(s)
- * are all persistent and reachable:
- */
- struct journal_entry_pin journal;
-
- /* Preallocated nodes we reserve when we start the update: */
- struct prealloc_nodes {
- struct btree *b[BTREE_UPDATE_NODES_MAX];
- unsigned nr;
- } prealloc_nodes[2];
-
- /* Nodes being freed: */
- struct keylist old_keys;
- u64 _old_keys[BTREE_UPDATE_NODES_MAX *
- BKEY_BTREE_PTR_U64s_MAX];
-
- /* Nodes being added: */
- struct keylist new_keys;
- u64 _new_keys[BTREE_UPDATE_NODES_MAX *
- BKEY_BTREE_PTR_U64s_MAX];
-
- /* New nodes, that will be made reachable by this update: */
- struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
- unsigned nr_new_nodes;
-
- struct btree *old_nodes[BTREE_UPDATE_NODES_MAX];
- __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX];
- unsigned nr_old_nodes;
-
- open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
- BCH_REPLICAS_MAX];
- open_bucket_idx_t nr_open_buckets;
-
- unsigned journal_u64s;
- u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
-
- /* Only here to reduce stack usage on recursive splits: */
- struct keylist parent_keys;
- /*
- * Enough room for btree_split's keys without realloc - btree node
- * pointers never have crc/compression info, so we only need to account
- * for the pointers for three keys
- */
- u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
-};
-
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
- struct btree_trans *,
- struct btree *,
- struct bkey_format);
-
-int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
-
-int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
- unsigned, unsigned, enum btree_node_sibling);
-
-static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
- btree_path_idx_t path_idx,
- unsigned level, unsigned flags,
- enum btree_node_sibling sib)
-{
- struct btree_path *path = trans->paths + path_idx;
- struct btree *b;
-
- EBUG_ON(!btree_node_locked(path, level));
-
- if (static_branch_unlikely(&bch2_btree_node_merging_disabled))
- return 0;
-
- b = path->l[level].b;
- if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
- return 0;
-
- return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
-}
-
-static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
- btree_path_idx_t path,
- unsigned level,
- unsigned flags)
-{
- bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
- return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
- btree_prev_sib) ?:
- bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
- btree_next_sib);
-}
-
-int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
- struct btree *, unsigned, unsigned);
-int bch2_btree_node_rewrite_key(struct btree_trans *,
- enum btree_id, unsigned,
- struct bkey_i *, unsigned);
-int bch2_btree_node_rewrite_pos(struct btree_trans *,
- enum btree_id, unsigned,
- struct bpos, unsigned, unsigned);
-int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
- struct btree *, unsigned);
-
-void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
-
-int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
- struct btree *, struct bkey_i *,
- unsigned, bool);
-int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
- struct bkey_i *, unsigned, bool);
-
-void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-
-int bch2_btree_root_alloc_fake_trans(struct btree_trans *, enum btree_id, unsigned);
-void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
-
-static inline unsigned btree_update_reserve_required(struct bch_fs *c,
- struct btree *b)
-{
- unsigned depth = btree_node_root(c, b)->c.level + 1;
-
- /*
- * Number of nodes we might have to allocate in a worst case btree
- * split operation - we split all the way up to the root, then allocate
- * a new root, unless we're already at max depth:
- */
- if (depth < BTREE_MAX_DEPTH)
- return (depth - b->c.level) * 2 + 1;
- else
- return (depth - b->c.level) * 2 - 1;
-}
-
-static inline void btree_node_reset_sib_u64s(struct btree *b)
-{
- b->sib_u64s[0] = b->nr.live_u64s;
- b->sib_u64s[1] = b->nr.live_u64s;
-}
-
-static inline void *btree_data_end(struct btree *b)
-{
- return (void *) b->data + btree_buf_bytes(b);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
-{
- return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
-{
- return btree_data_end(b);
-}
-
-static inline void *write_block(struct btree *b)
-{
- return (void *) b->data + (b->written << 9);
-}
-
-static inline bool __btree_addr_written(struct btree *b, void *p)
-{
- return p < write_block(b);
-}
-
-static inline bool bset_written(struct btree *b, struct bset *i)
-{
- return __btree_addr_written(b, i);
-}
-
-static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
-{
- return __btree_addr_written(b, k);
-}
-
-static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
-{
- ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
- b->whiteout_u64s;
- ssize_t total = btree_buf_bytes(b) >> 3;
-
- /* Always leave one extra u64 for bch2_varint_decode: */
- used++;
-
- return total - used;
-}
-
-static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
-{
- ssize_t remaining = __bch2_btree_u64s_remaining(b,
- btree_bkey_last(b, bset_tree_last(b)));
-
- BUG_ON(remaining < 0);
-
- if (bset_written(b, btree_bset_last(b)))
- return 0;
-
- return remaining;
-}
-
-#define BTREE_WRITE_SET_U64s_BITS 9
-
-static inline unsigned btree_write_set_buffer(struct btree *b)
-{
- /*
- * Could buffer up larger amounts of keys for btrees with larger keys,
- * pending benchmarking:
- */
- return 8 << BTREE_WRITE_SET_U64s_BITS;
-}
-
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
-{
- struct bset_tree *t = bset_tree_last(b);
- struct btree_node_entry *bne = max(write_block(b),
- (void *) btree_bkey_last(b, t));
- ssize_t remaining_space =
- __bch2_btree_u64s_remaining(b, bne->keys.start);
-
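-	/*
-	 * If the last bset has already been written we need a new one to have
-	 * anywhere to insert (provided there's still room in the node);
-	 * otherwise only start a new bset once the current unwritten one has
-	 * grown past the write set buffer size:
-	 */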
- if (unlikely(bset_written(b, bset(b, t)))) {
- if (b->written + block_sectors(c) <= btree_sectors(c))
- return bne;
- } else {
- if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
- remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
- return bne;
- }
-
- return NULL;
-}
-
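-/*
- * Add a whiteout for @pos to the unwritten whiteouts area at the end of the
- * node's buffer, packed if possible (otherwise stored as an unpacked bkey):
- */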
-static inline void push_whiteout(struct btree *b, struct bpos pos)
-{
- struct bkey_packed k;
-
- BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
- EBUG_ON(btree_node_just_written(b));
-
- if (!bkey_pack_pos(&k, pos, b)) {
- struct bkey *u = (void *) &k;
-
- bkey_init(u);
- u->p = pos;
- }
-
- k.needs_whiteout = true;
-
- b->whiteout_u64s += k.u64s;
- bkey_p_copy(unwritten_whiteouts_start(b), &k);
-}
-
-/*
- * write lock must be held on @b (else the dirty bset that we were going to
- * insert into could be written out from under us)
- */
-static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
-{
- if (unlikely(btree_node_need_rewrite(b)))
- return false;
-
- return u64s <= bch2_btree_keys_u64s_remaining(b);
-}
-
-void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
-
-bool bch2_btree_interior_updates_flush(struct bch_fs *);
-
-void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
-struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
- struct jset_entry *, unsigned long);
-
-void bch2_async_btree_node_rewrites_flush(struct bch_fs *);
-void bch2_do_pending_node_rewrites(struct bch_fs *);
-void bch2_free_pending_node_rewrites(struct bch_fs *);
-
-void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_btree_interior_update_exit(struct bch_fs *);
-void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
-int bch2_fs_btree_interior_update_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
deleted file mode 100644
index 4b095235a0d2..000000000000
--- a/fs/bcachefs/btree_write_buffer.c
+++ /dev/null
@@ -1,893 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "disk_accounting.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-
-#include <linux/prefetch.h>
-#include <linux/sort.h>
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *,
- struct journal_entry_pin *, u64);
-
-static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
- return (cmp_int(l->hi, r->hi) ?:
- cmp_int(l->mi, r->mi) ?:
- cmp_int(l->lo, r->lo)) >= 0;
-}
-
-static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
-{
-#ifdef CONFIG_X86_64
- int cmp;
-
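-	/*
-	 * Same 192 bit unsigned compare as __wb_key_ref_cmp(), done as one
-	 * sub/sbb chain; the "ae" (carry clear) condition code gives l >= r:
-	 */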
- asm("mov (%[l]), %%rax;"
- "sub (%[r]), %%rax;"
- "mov 8(%[l]), %%rax;"
- "sbb 8(%[r]), %%rax;"
- "mov 16(%[l]), %%rax;"
- "sbb 16(%[r]), %%rax;"
- : "=@ccae" (cmp)
- : [l] "r" (l), [r] "r" (r)
- : "rax", "cc");
-
- EBUG_ON(cmp != __wb_key_ref_cmp(l, r));
- return cmp;
-#else
- return __wb_key_ref_cmp(l, r);
-#endif
-}
-
-static int wb_key_seq_cmp(const void *_l, const void *_r)
-{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
-
- return cmp_int(l->journal_seq, r->journal_seq);
-}
-
-/* Compare excluding idx, the low 24 bits: */
-static inline bool wb_key_eq(const void *_l, const void *_r)
-{
- const struct wb_key_ref *l = _l;
- const struct wb_key_ref *r = _r;
-
- return !((l->hi ^ r->hi)|
- (l->mi ^ r->mi)|
- ((l->lo >> 24) ^ (r->lo >> 24)));
-}
-
-static noinline void wb_sort(struct wb_key_ref *base, size_t num)
-{
- size_t n = num, a = num / 2;
-
- if (!a) /* num < 2 || size == 0 */
- return;
-
- for (;;) {
- size_t b, c, d;
-
- if (a) /* Building heap: sift down --a */
- --a;
- else if (--n) /* Sorting: Extract root to --n */
- swap(base[0], base[n]);
- else /* Sort complete */
- break;
-
- /*
- * Sift element at "a" down into heap. This is the
- * "bottom-up" variant, which significantly reduces
- * calls to cmp_func(): we find the sift-down path all
- * the way to the leaves (one compare per level), then
- * backtrack to find where to insert the target element.
- *
- * Because elements tend to sift down close to the leaves,
- * this uses fewer compares than doing two per level
- * on the way down. (A bit more than half as many on
- * average, 3/4 worst-case.)
- */
- for (b = a; c = 2*b + 1, (d = c + 1) < n;)
- b = wb_key_ref_cmp(base + c, base + d) ? c : d;
- if (d == n) /* Special case last leaf with no sibling */
- b = c;
-
- /* Now backtrack from "b" to the correct location for "a" */
- while (b != a && wb_key_ref_cmp(base + a, base + b))
- b = (b - 1) / 2;
- c = b; /* Where "a" belongs */
- while (b != a) { /* Shift it into place */
- b = (b - 1) / 2;
- swap(base[b], base[c]);
- }
- }
-}
-
-static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree_write_buffered_key *wb)
-{
- struct btree_path *path = btree_iter_path(trans, iter);
-
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-
- trans->journal_res.seq = wb->journal_seq;
-
- return bch2_trans_update(trans, iter, &wb->k,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_TRANS_COMMIT_journal_reclaim);
-}
-
-static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
- struct btree_write_buffered_key *wb,
- bool *write_locked,
- bool *accounting_accumulated,
- size_t *fast)
-{
- struct btree_path *path;
- int ret;
-
- EBUG_ON(!wb->journal_seq);
- EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
- EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
-
- ret = bch2_btree_iter_traverse(trans, iter);
- if (ret)
- return ret;
-
- if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
-
- if (k.k->type == KEY_TYPE_accounting)
- bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
- bkey_s_c_to_accounting(k));
- }
- *accounting_accumulated = true;
-
- /*
- * We can't clone a path that has write locks: unshare it now, before
- * set_pos and traverse():
- */
- if (btree_iter_path(trans, iter)->ref > 1)
- iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
-
- path = btree_iter_path(trans, iter);
-
- if (!*write_locked) {
- ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
- if (ret)
- return ret;
-
- bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
- *write_locked = true;
- }
-
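-	/* Not enough room in the leaf: fall back to a full transaction commit: */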
- if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
- *write_locked = false;
- return wb_flush_one_slowpath(trans, iter, wb);
- }
-
- EBUG_ON(!bpos_eq(wb->k.k.p, path->pos));
-
- bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
- (*fast)++;
- return 0;
-}
-
-/*
- * Update a btree with a write buffered key using the journal seq of the
- * original write buffer insert.
- *
- * It is not safe to rejournal the key once it has been inserted into the write
- * buffer because that may break recovery ordering. For example, the key may
- * have already been modified in the active write buffer in a seq that comes
- * before the current transaction. If we were to journal this key again and
- * crash, recovery would process updates in the wrong order.
- */
-static int
-btree_write_buffered_insert(struct btree_trans *trans,
- struct btree_write_buffered_key *wb)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
- BTREE_ITER_cached|BTREE_ITER_intent);
-
- trans->journal_res.seq = wb->journal_seq;
-
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, &wb->k,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
-{
- struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
- struct journal *j = &c->journal;
-
- if (!wb->inc.keys.nr)
- return;
-
- bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
- bch2_btree_write_buffer_journal_flush);
-
- darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
- darray_resize(&wb->sorted, wb->flushing.keys.size);
-
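-	/* If flushing is empty and sorted has room, just swap the darrays: */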
- if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
- swap(wb->flushing.keys, wb->inc.keys);
- goto out;
- }
-
- size_t nr = min(darray_room(wb->flushing.keys),
- wb->sorted.size - wb->flushing.keys.nr);
- nr = min(nr, wb->inc.keys.nr);
-
- memcpy(&darray_top(wb->flushing.keys),
- wb->inc.keys.data,
- sizeof(wb->inc.keys.data[0]) * nr);
-
- memmove(wb->inc.keys.data,
- wb->inc.keys.data + nr,
- sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
-
- wb->flushing.keys.nr += nr;
- wb->inc.keys.nr -= nr;
-out:
- if (!wb->inc.keys.nr)
- bch2_journal_pin_drop(j, &wb->inc.pin);
- else
- bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
- bch2_btree_write_buffer_journal_flush);
-
- if (j->watermark) {
- spin_lock(&j->lock);
- bch2_journal_set_watermark(j);
- spin_unlock(&j->lock);
- }
-
- BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
-}
-
-int bch2_btree_write_buffer_insert_err(struct bch_fs *c,
- enum btree_id btree, struct bkey_i *k)
-{
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
- bch2_btree_id_to_text(&buf, btree);
- prt_str(&buf, "\n");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return -EROFS;
-}
-
-static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_iter iter = {};
- size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
- bool write_locked = false;
- bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
- int ret = 0;
-
- ret = bch2_journal_error(&c->journal);
- if (ret)
- return ret;
-
- bch2_trans_unlock(trans);
- bch2_trans_begin(trans);
-
- mutex_lock(&wb->inc.lock);
- move_keys_from_inc_to_flushing(wb);
- mutex_unlock(&wb->inc.lock);
-
- for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
- wb->sorted.data[i].idx = i;
- wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
- memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
- }
- wb->sorted.nr = wb->flushing.keys.nr;
-
- /*
- * We first sort so that we can detect and skip redundant updates, and
- * then we attempt to flush in sorted btree order, as this is most
- * efficient.
- *
- * However, since we're not flushing in the order they appear in the
- * journal we won't be able to drop our journal pin until everything is
- * flushed - which means this could deadlock the journal if we weren't
- * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
- * if it would block taking a journal reservation.
- *
- * If that happens, simply skip the key so we can optimistically insert
- * as many keys as possible in the fast path.
- */
- wb_sort(wb->sorted.data, wb->sorted.nr);
-
- darray_for_each(wb->sorted, i) {
- struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
-
- if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
- ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k);
- goto err;
- }
-
- for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
- prefetch(&wb->flushing.keys.data[n->idx]);
-
- BUG_ON(!k->journal_seq);
-
- if (!accounting_replay_done &&
- k->k.k.type == KEY_TYPE_accounting) {
- slowpath++;
- continue;
- }
-
- if (i + 1 < &darray_top(wb->sorted) &&
- wb_key_eq(i, i + 1)) {
- struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
-
- if (k->k.k.type == KEY_TYPE_accounting &&
- n->k.k.type == KEY_TYPE_accounting)
- bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
- bkey_i_to_s_c_accounting(&k->k));
-
- overwritten++;
- n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
- k->journal_seq = 0;
- continue;
- }
-
- if (write_locked) {
- struct btree_path *path = btree_iter_path(trans, &iter);
-
- if (path->btree_id != i->btree ||
- bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- write_locked = false;
-
- ret = lockrestart_do(trans,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_foreground_maybe_merge(trans, iter.path, 0,
- BCH_WATERMARK_reclaim|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc));
- if (ret)
- goto err;
- }
- }
-
- if (!iter.path || iter.btree_id != k->btree) {
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
- BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- }
-
- bch2_btree_iter_set_pos(trans, &iter, k->k.k.p);
- btree_iter_path(trans, &iter)->preserve = false;
-
- bool accounting_accumulated = false;
- do {
- if (race_fault()) {
- ret = bch_err_throw(c, journal_reclaim_would_deadlock);
- break;
- }
-
- ret = wb_flush_one(trans, &iter, k, &write_locked,
- &accounting_accumulated, &fast);
- if (!write_locked)
- bch2_trans_begin(trans);
- } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
- if (!ret) {
- k->journal_seq = 0;
- } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
- slowpath++;
- ret = 0;
- } else
- break;
- }
-
- if (write_locked) {
- struct btree_path *path = btree_iter_path(trans, &iter);
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- goto err;
-
- if (slowpath) {
- /*
- * Flush in the order they were present in the journal, so that
- * we can release journal pins:
-		 * The fast path zeroed journal_seq for keys that were
-		 * successfully flushed, so we can skip those here.
- */
- trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
-
- sort_nonatomic(wb->flushing.keys.data,
- wb->flushing.keys.nr,
- sizeof(wb->flushing.keys.data[0]),
- wb_key_seq_cmp, NULL);
-
- darray_for_each(wb->flushing.keys, i) {
- if (!i->journal_seq)
- continue;
-
- if (!accounting_replay_done &&
- i->k.k.type == KEY_TYPE_accounting) {
- could_not_insert++;
- continue;
- }
-
- if (!could_not_insert)
- bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
- bch2_btree_write_buffer_journal_flush);
-
- bch2_trans_begin(trans);
-
- ret = commit_do(trans, NULL, NULL,
- BCH_WATERMARK_reclaim|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc|
-					BCH_TRANS_COMMIT_no_journal_res,
- btree_write_buffered_insert(trans, i));
- if (ret)
- goto err;
-
- i->journal_seq = 0;
- }
-
- /*
- * If journal replay hasn't finished with accounting keys we
- * can't flush accounting keys at all - condense them and leave
- * them for next time.
- *
- * Q: Can the write buffer overflow?
-		 * A: Shouldn't be any actual risk. It's just new accounting
- * updates that the write buffer can't flush, and those are only
- * going to be generated by interior btree node updates as
- * journal replay has to split/rewrite nodes to make room for
- * its updates.
- *
-		 * And for those new accounting updates, updates to the same
-		 * counters get accumulated as they're flushed from the journal
-		 * to the write buffer - see the eytzinger tree accumulation in
-		 * bch2_accounting_key_to_wb(). So we could only overflow if the
-		 * number of distinct counters touched is somehow very large.
- */
- if (could_not_insert) {
- struct btree_write_buffered_key *dst = wb->flushing.keys.data;
-
- darray_for_each(wb->flushing.keys, i)
- if (i->journal_seq)
- *dst++ = *i;
- wb->flushing.keys.nr = dst - wb->flushing.keys.data;
- }
- }
-err:
- if (ret || !could_not_insert) {
- bch2_journal_pin_drop(j, &wb->flushing.pin);
- wb->flushing.keys.nr = 0;
- }
-
- bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
- trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
- return ret;
-}
-
-static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
-{
- struct journal_keys_to_wb dst;
- int ret = 0;
-
- bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
-
- for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
- jset_entry_for_each_key(entry, k) {
- ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
- if (ret)
- goto out;
- }
-
- entry->type = BCH_JSET_ENTRY_btree_keys;
- }
-out:
- ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
- return ret;
-}
-
-static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq)
-{
- struct journal *j = &c->journal;
- struct journal_buf *buf;
- bool blocked;
- int ret = 0;
-
- while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) {
- ret = bch2_journal_keys_to_write_buffer(c, buf);
-
- if (!blocked && !ret) {
- spin_lock(&j->lock);
- buf->need_flush_to_write_buffer = false;
- spin_unlock(&j->lock);
- }
-
- mutex_unlock(&j->buf_lock);
-
- if (blocked) {
- bch2_journal_unblock(j);
- break;
- }
- }
-
- return ret;
-}
-
-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq,
- bool *did_work)
-{
- struct bch_fs *c = trans->c;
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret = 0, fetch_from_journal_err;
-
- do {
- bch2_trans_unlock(trans);
-
- fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq);
-
- *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr;
-
- /*
- * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
- * is not guaranteed to empty wb->inc:
- */
- mutex_lock(&wb->flushing.lock);
- ret = bch2_btree_write_buffer_flush_locked(trans);
- mutex_unlock(&wb->flushing.lock);
- } while (!ret &&
- (fetch_from_journal_err ||
- (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) ||
- (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq)));
-
- return ret;
-}
-
-static int bch2_btree_write_buffer_journal_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool did_work = false;
-
- return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work));
-}
-
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- bool did_work = false;
-
- trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
-
- return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work);
-}
-
-/*
- * The write buffer requires flushing when going RO: keys in the journal for the
- * write buffer don't have a journal pin yet
- */
-bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c)
-{
- if (bch2_journal_error(&c->journal))
- return false;
-
- bool did_work = false;
- bch2_trans_run(c, btree_write_buffer_flush_seq(trans,
- journal_cur_seq(&c->journal), &did_work));
- return did_work;
-}
-
-int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret = 0;
-
- if (mutex_trylock(&wb->flushing.lock)) {
- ret = bch2_btree_write_buffer_flush_locked(trans);
- mutex_unlock(&wb->flushing.lock);
- }
-
- return ret;
-}
-
-int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer))
- return bch_err_throw(c, erofs_no_writes);
-
- int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
- return ret;
-}
-
-/*
- * In check and repair code, when checking references to write buffer btrees we
- * need to issue a flush before we have a definitive error: this issues a flush
- * if this is a key we haven't yet checked.
- */
-int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
- struct bkey_s_c referring_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bkey_buf tmp;
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
-
- if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
- if (trace_write_buffer_maybe_flush_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, referring_k);
- trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf);
- printbuf_exit(&buf);
- }
-
- bch2_bkey_buf_reassemble(&tmp, c, referring_k);
-
- if (bkey_is_btree_ptr(referring_k.k)) {
- bch2_trans_unlock(trans);
- bch2_btree_interior_updates_flush(c);
- }
-
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- bch2_bkey_buf_copy(last_flushed, c, tmp.k);
-
- /* can we avoid the unconditional restart? */
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_);
- ret = bch_err_throw(c, transaction_restart_write_buffer_flush);
- }
-err:
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
-static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret;
-
- mutex_lock(&wb->flushing.lock);
- do {
- ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
- } while (!ret && bch2_btree_write_buffer_should_flush(c));
- mutex_unlock(&wb->flushing.lock);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
-}
-
-static void wb_accounting_sort(struct btree_write_buffer *wb)
-{
- eytzinger0_sort(wb->accounting.data, wb->accounting.nr,
- sizeof(wb->accounting.data[0]),
- wb_key_cmp, NULL);
-}
-
-int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree,
- struct bkey_i_accounting *k)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key new = { .btree = btree };
-
- bkey_copy(&new.k, &k->k_i);
-
- int ret = darray_push(&wb->accounting, new);
- if (ret)
- return ret;
-
- wb_accounting_sort(wb);
- return 0;
-}
-
-int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
- struct journal_keys_to_wb *dst,
- enum btree_id btree, struct bkey_i *k)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- int ret;
-retry:
- ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
- if (!ret && dst->wb == &wb->flushing)
- ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
-
- if (unlikely(ret)) {
- if (dst->wb == &c->btree_write_buffer.flushing) {
- mutex_unlock(&dst->wb->lock);
- dst->wb = &c->btree_write_buffer.inc;
- bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
- bch2_btree_write_buffer_journal_flush);
- goto retry;
- }
-
- return ret;
- }
-
- dst->room = darray_room(dst->wb->keys);
- if (dst->wb == &wb->flushing)
- dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
- BUG_ON(!dst->room);
- BUG_ON(!dst->seq);
-
- struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
- wb_k->journal_seq = dst->seq;
- wb_k->btree = btree;
- bkey_copy(&wb_k->k, k);
- dst->wb->keys.nr++;
- dst->room--;
- return 0;
-}
-
-void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- if (mutex_trylock(&wb->flushing.lock)) {
- mutex_lock(&wb->inc.lock);
- move_keys_from_inc_to_flushing(wb);
-
- /*
- * Attempt to skip wb->inc, and add keys directly to
- * wb->flushing, saving us a copy later:
- */
-
- if (!wb->inc.keys.nr) {
- dst->wb = &wb->flushing;
- } else {
- mutex_unlock(&wb->flushing.lock);
- dst->wb = &wb->inc;
- }
- } else {
- mutex_lock(&wb->inc.lock);
- dst->wb = &wb->inc;
- }
-
- dst->room = darray_room(dst->wb->keys);
- if (dst->wb == &wb->flushing)
- dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
- dst->seq = seq;
-
- bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
- bch2_btree_write_buffer_journal_flush);
-
- darray_for_each(wb->accounting, i)
- memset(&i->k.v, 0, bkey_val_bytes(&i->k.k));
-}
-
-int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- unsigned live_accounting_keys = 0;
- int ret = 0;
-
- darray_for_each(wb->accounting, i)
- if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) {
- i->journal_seq = dst->seq;
- live_accounting_keys++;
- ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k);
- if (ret)
- break;
- }
-
- if (live_accounting_keys * 2 < wb->accounting.nr) {
- struct btree_write_buffered_key *dst = wb->accounting.data;
-
- darray_for_each(wb->accounting, src)
- if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k)))
- *dst++ = *src;
- wb->accounting.nr = dst - wb->accounting.data;
- wb_accounting_sort(wb);
- }
-
- if (!dst->wb->keys.nr)
- bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
-
- if (bch2_btree_write_buffer_should_flush(c) &&
- __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) &&
- !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
-
- if (dst->wb == &wb->flushing)
- mutex_unlock(&wb->flushing.lock);
- mutex_unlock(&wb->inc.lock);
-
- return ret;
-}
-
-static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
-{
- if (wb->keys.size >= new_size)
- return 0;
-
- if (!mutex_trylock(&wb->lock))
- return -EINTR;
-
- int ret = darray_resize(&wb->keys, new_size);
- mutex_unlock(&wb->lock);
- return ret;
-}
-
-int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- return wb_keys_resize(&wb->flushing, new_size) ?:
- wb_keys_resize(&wb->inc, new_size);
-}
-
-void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
- !bch2_journal_error(&c->journal));
-
- darray_exit(&wb->accounting);
- darray_exit(&wb->sorted);
- darray_exit(&wb->flushing.keys);
- darray_exit(&wb->inc.keys);
-}
-
-void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- mutex_init(&wb->inc.lock);
- mutex_init(&wb->flushing.lock);
- INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
-}
-
-int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- /* Will be resized by journal as needed: */
- unsigned initial_size = 1 << 16;
-
- return darray_make_room(&wb->inc.keys, initial_size) ?:
- darray_make_room(&wb->flushing.keys, initial_size) ?:
- darray_make_room(&wb->sorted, initial_size);
-}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
deleted file mode 100644
index c351d21aca0b..000000000000
--- a/fs/bcachefs/btree_write_buffer.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
-#define _BCACHEFS_BTREE_WRITE_BUFFER_H
-
-#include "bkey.h"
-#include "disk_accounting.h"
-
-static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
-}
-
-static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
-}
-
-struct btree_trans;
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *);
-int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
-int bch2_btree_write_buffer_tryflush(struct btree_trans *);
-
-struct bkey_buf;
-int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *);
-
-struct journal_keys_to_wb {
- struct btree_write_buffer_keys *wb;
- size_t room;
- u64 seq;
-};
-
-static inline int wb_key_cmp(const void *_l, const void *_r)
-{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
-
- return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p);
-}
-
-int bch2_accounting_key_to_wb_slowpath(struct bch_fs *,
- enum btree_id, struct bkey_i_accounting *);
-
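-/*
- * Accounting updates are accumulated in place: if the sorted wb->accounting
- * array already has an entry for this counter (found via eytzinger search) we
- * add to it; only new counters take the slowpath, which appends and re-sorts:
- */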
-static inline int bch2_accounting_key_to_wb(struct bch_fs *c,
- enum btree_id btree, struct bkey_i_accounting *k)
-{
- struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key search;
- search.btree = btree;
- search.k.k.p = k->k.p;
-
- unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr,
- sizeof(wb->accounting.data[0]),
- wb_key_cmp, &search);
-
- if (idx >= wb->accounting.nr)
- return bch2_accounting_key_to_wb_slowpath(c, btree, k);
-
- struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k);
- bch2_accounting_accumulate(dst, accounting_i_to_s_c(k));
- return 0;
-}
-
-int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
- struct journal_keys_to_wb *,
- enum btree_id, struct bkey_i *);
-
-static inline int __bch2_journal_key_to_wb(struct bch_fs *c,
- struct journal_keys_to_wb *dst,
- enum btree_id btree, struct bkey_i *k)
-{
- if (unlikely(!dst->room))
- return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
-
- struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
- wb_k->journal_seq = dst->seq;
- wb_k->btree = btree;
- bkey_copy(&wb_k->k, k);
- dst->wb->keys.nr++;
- dst->room--;
- return 0;
-}
-
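-/*
- * Add one journal key to the write buffer: accounting keys are accumulated via
- * bch2_accounting_key_to_wb(), everything else is appended via
- * __bch2_journal_key_to_wb():
- */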
-static inline int bch2_journal_key_to_wb(struct bch_fs *c,
- struct journal_keys_to_wb *dst,
- enum btree_id btree, struct bkey_i *k)
-{
- if (unlikely(!btree_type_uses_write_buffer(btree))) {
- int ret = bch2_btree_write_buffer_insert_err(c, btree, k);
- dump_stack();
- return ret;
- }
-
- EBUG_ON(!dst->seq);
-
- return k->k.type == KEY_TYPE_accounting
- ? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k))
- : __bch2_journal_key_to_wb(c, dst, btree, k);
-}
-
-void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
-int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
-
-int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
-void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
-void bch2_fs_btree_write_buffer_init_early(struct bch_fs *);
-int bch2_fs_btree_write_buffer_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
deleted file mode 100644
index e9e76e20f43b..000000000000
--- a/fs/bcachefs/btree_write_buffer_types.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
-#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
-
-#include "darray.h"
-#include "journal_types.h"
-
-#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
-#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
-
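-/*
- * Sort key for the write buffer: btree id, position and an index into the
- * flushing array, packed so entries can be compared as three u64s (hi/mi/lo);
- * idx occupies the low 24 bits and is excluded from equality checks:
- */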
-struct wb_key_ref {
-union {
- struct {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- unsigned idx:24;
- u8 pos[sizeof(struct bpos)];
- enum btree_id btree:8;
-#else
- enum btree_id btree:8;
- u8 pos[sizeof(struct bpos)];
- unsigned idx:24;
-#endif
- } __packed;
- struct {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- u64 lo;
- u64 mi;
- u64 hi;
-#else
- u64 hi;
- u64 mi;
- u64 lo;
-#endif
- };
-};
-};
-
-struct btree_write_buffered_key {
- enum btree_id btree:8;
- u64 journal_seq:56;
- __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
-};
-
-struct btree_write_buffer_keys {
- DARRAY(struct btree_write_buffered_key) keys;
- struct journal_entry_pin pin;
- struct mutex lock;
-};
-
-struct btree_write_buffer {
- DARRAY(struct wb_key_ref) sorted;
- struct btree_write_buffer_keys inc;
- struct btree_write_buffer_keys flushing;
- struct work_struct flush_work;
-
- DARRAY(struct btree_write_buffered_key) accounting;
-};
-
-#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
deleted file mode 100644
index f25903c10e8a..000000000000
--- a/fs/bcachefs/buckets.c
+++ /dev/null
@@ -1,1395 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "bset.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "buckets_waiting_for_journal.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "error.h"
-#include "inode.h"
-#include "movinggc.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/preempt.h>
-
-void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
-{
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets);
-}
-
-void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage)
-{
- memset(usage, 0, sizeof(*usage));
- acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage,
- sizeof(struct bch_dev_usage_full) / sizeof(u64));
-}
-
-static u64 reserve_factor(u64 r)
-{
- return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
-}
-
-static struct bch_fs_usage_short
-__bch2_fs_usage_read_short(struct bch_fs *c)
-{
- struct bch_fs_usage_short ret;
- u64 data, reserved;
-
- ret.capacity = c->capacity -
- percpu_u64_get(&c->usage->hidden);
-
- data = percpu_u64_get(&c->usage->data) +
- percpu_u64_get(&c->usage->btree);
- reserved = percpu_u64_get(&c->usage->reserved) +
- percpu_u64_get(c->online_reserved);
-
- ret.used = min(ret.capacity, data + reserve_factor(reserved));
- ret.free = ret.capacity - ret.used;
-
- ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes);
-
- return ret;
-}
-
-struct bch_fs_usage_short
-bch2_fs_usage_read_short(struct bch_fs *c)
-{
- struct bch_fs_usage_short ret;
-
- percpu_down_read(&c->mark_lock);
- ret = __bch2_fs_usage_read_short(c);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-void bch2_dev_usage_to_text(struct printbuf *out,
- struct bch_dev *ca,
- struct bch_dev_usage_full *usage)
-{
- if (out->nr_tabstops < 5) {
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- }
-
- prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++) {
- bch2_prt_data_type(out, i);
- prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
- usage->d[i].buckets,
- usage->d[i].sectors,
- usage->d[i].fragmented);
- }
-
- prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets);
-}
-
-static int bch2_check_fix_ptr(struct btree_trans *trans,
- struct bkey_s_c k,
- struct extent_ptr_decoded p,
- const union bch_extent_entry *entry,
- bool *do_update)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
- if (!ca) {
- if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
- trans, ptr_to_invalid_device,
- "pointer to missing device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
- return 0;
- }
-
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- if (!g) {
- if (fsck_err(trans, ptr_to_invalid_device,
- "pointer to invalid bucket on device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
- goto out;
- }
-
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
-
- if (fsck_err_on(!g->gen_valid,
- trans, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- } else {
- /* this pointer will be dropped */
- *do_update = true;
- goto out;
- }
- }
-
- /* g->gen_valid == true */
-
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
- trans, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached &&
- (g->data_type != BCH_DATA_btree ||
- data_type == BCH_DATA_btree)) {
- g->data_type = data_type;
- g->stripe_sectors = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- }
-
- *do_update = true;
- }
-
- if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
- trans, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
-
- if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
- trans, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
-
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
- goto out;
-
- if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
- trans, ptr_bucket_data_type_mismatch,
- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached &&
- data_type == BCH_DATA_btree) {
- switch (g->data_type) {
- case BCH_DATA_sb:
- bch_err(c, "btree and superblock in the same bucket - cannot repair");
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- goto out;
- case BCH_DATA_journal:
- ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr));
- bch_err_msg(c, ret, "error deleting journal bucket %zu",
- PTR_BUCKET_NR(ca, &p.ptr));
- if (ret)
- goto out;
- break;
- }
-
- g->data_type = data_type;
- g->stripe_sectors = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- } else {
- *do_update = true;
- }
- }
-
- if (p.has_ec) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
- if (fsck_err_on(!m || !m->alive,
- trans, ptr_to_missing_stripe,
- "pointer to nonexistent stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
-
- if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
- trans, ptr_to_incorrect_stripe,
- "pointer does not match stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
- }
-out:
-fsck_err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_fix_ptrs(struct btree_trans *trans,
- enum btree_id btree, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry_c;
- struct extent_ptr_decoded p = { 0 };
- bool do_update = false;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
-	/* We don't yet do btree key updates correctly when we're RW */
- BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
- bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
- ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
- if (ret)
- goto err;
- }
-
- if (do_update) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- scoped_guard(rcu)
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
-
- if (level) {
- /*
- * We don't want to drop btree node pointers - if the
- * btree node isn't there anymore, the read path will
- * sort it out:
- */
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- scoped_guard(rcu)
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen;
- }
- } else {
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
-
- rcu_read_lock();
-restart_drop_ptrs:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
-
- if ((p.ptr.cached &&
- (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
- (!p.ptr.cached &&
- gen_cmp(p.ptr.gen, g->gen) < 0) ||
- gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
- (g->data_type &&
- g->data_type != data_type)) {
- bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
- goto restart_drop_ptrs;
- }
- }
- rcu_read_unlock();
-again:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_extent_entry_for_each(ptrs, entry) {
- if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
- entry->stripe_ptr.idx);
- union bch_extent_entry *next_ptr;
-
- bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
- if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
- goto found;
- next_ptr = NULL;
-found:
- if (!next_ptr) {
- bch_err(c, "aieee, found stripe ptr with no data ptr");
- continue;
- }
-
- if (!m || !m->alive ||
- !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
- &next_ptr->ptr,
- m->sectors)) {
- bch2_bkey_extent_entry_drop(new, entry);
- goto again;
- }
- }
- }
- }
-
- if (0) {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- bch_info(c, "updated %s", buf.buf);
-
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
- bch_info(c, "new key %s", buf.buf);
- }
-
- if (!(flags & BTREE_TRIGGER_is_root)) {
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
- BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_internal_snapshot_node|
- BTREE_TRIGGER_norun);
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
-
- if (level)
- bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
- } else {
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
- jset_u64s(new->k.u64s));
- ret = PTR_ERR_OR_ZERO(e);
- if (ret)
- goto err;
-
- journal_entry_set(e,
- BCH_JSET_ENTRY_btree_root,
- btree, level - 1,
- new, new->k.u64s);
-
- /*
- * no locking, we're single threaded and not rw yet, see
-		 * the big assertion above that we repeat here:
- */
- BUG_ON(test_bit(BCH_FS_rw, &c->flags));
-
- struct btree *b = bch2_btree_id_root(c, btree)->b;
- bkey_copy(&b->key, new);
- }
- }
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf,
- struct bkey_s_c k, bool insert, enum bch_sb_error_id id)
-{
- struct bch_fs *c = trans->c;
-
- prt_printf(buf, "\nwhile marking ");
- bch2_bkey_val_to_text(buf, c, k);
- prt_newline(buf);
-
- bool print = __bch2_count_fsck_err(c, id, buf);
-
- int ret = bch2_run_explicit_recovery_pass(c, buf,
- BCH_RECOVERY_PASS_check_allocations, 0);
-
- if (insert) {
- bch2_trans_updates_to_text(buf, trans);
- __bch2_inconsistent_error(c, buf);
- /*
- * If we're in recovery, run_explicit_recovery_pass might give
- * us an error code for rewinding recovery
- */
- if (!ret)
- ret = bch_err_throw(c, bucket_ref_update);
- } else {
- /* Always ignore overwrite errors, so that deletion works */
- ret = 0;
- }
-
- if (print || insert)
- bch2_print_str(c, KERN_ERR, buf->buf);
- return ret;
-}
-
-int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 b_gen, u8 bucket_data_type,
- u32 *bucket_sectors)
-{
- struct bch_fs *c = trans->c;
- size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
- struct printbuf buf = PRINTBUF;
- bool inserting = sectors > 0;
- int ret = 0;
-
- BUG_ON(!sectors);
-
- if (unlikely(gen_after(ptr->gen, b_gen))) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf,
- "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen);
-
- ret = bucket_ref_update_err(trans, &buf, k, inserting,
- BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen);
- goto out;
- }
-
- if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen);
-
- ret = bucket_ref_update_err(trans, &buf, k, inserting,
- BCH_FSCK_ERR_ptr_too_stale);
- goto out;
- }
-
- if (b_gen != ptr->gen && ptr->cached) {
- ret = 1;
- goto out;
- }
-
- if (unlikely(b_gen != ptr->gen)) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf,
- "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)",
- ptr->dev, bucket_nr, b_gen,
- bucket_gen_get(ca, bucket_nr),
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen);
-
- ret = bucket_ref_update_err(trans, &buf, k, inserting,
- BCH_FSCK_ERR_stale_dirty_ptr);
- goto out;
- }
-
- if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type),
- bch2_data_type_str(ptr_data_type));
-
- ret = bucket_ref_update_err(trans, &buf, k, inserting,
- BCH_FSCK_ERR_ptr_bucket_data_type_mismatch);
- goto out;
- }
-
- if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) {
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf,
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- *bucket_sectors, sectors);
-
- ret = bucket_ref_update_err(trans, &buf, k, inserting,
- BCH_FSCK_ERR_bucket_sector_count_overflow);
- sectors = -*bucket_sectors;
- goto out;
- }
-
- *bucket_sectors += sectors;
-out:
- printbuf_exit(&buf);
- return ret;
-}
-
-void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- static int warned_disk_usage = 0;
- bool warn = false;
-
- percpu_down_read(&c->mark_lock);
- struct bch_fs_usage_base *src = &trans->fs_usage_delta;
-
- s64 added = src->btree + src->data + src->reserved;
-
- /*
- * Not allowed to reduce sectors_available except by getting a
- * reservation:
- */
- s64 should_not_have_added = added - (s64) disk_res_sectors;
- if (unlikely(should_not_have_added > 0)) {
- u64 old, new;
-
- old = atomic64_read(&c->sectors_available);
- do {
- new = max_t(s64, 0, old - should_not_have_added);
- } while (!atomic64_try_cmpxchg(&c->sectors_available,
- &old, new));
-
- added -= should_not_have_added;
- warn = true;
- }
-
- if (added > 0) {
- trans->disk_res->sectors -= added;
- this_cpu_sub(*c->online_reserved, added);
- }
-
- preempt_disable();
- struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
- acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
- preempt_enable();
- percpu_up_read(&c->mark_lock);
-
- if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
- bch2_trans_inconsistent(trans,
-					"disk usage increased %lli more than %llu sectors reserved",
- should_not_have_added, disk_res_sectors);
-}
-
-/* KEY_TYPE_extent: */
-
-static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
- struct bkey_s_c k,
- const struct extent_ptr_decoded *p,
- s64 sectors, enum bch_data_type ptr_data_type,
- struct bch_alloc_v4 *a,
- bool insert)
-{
- u32 *dst_sectors = p->has_ec ? &a->stripe_sectors :
- !p->ptr.cached ? &a->dirty_sectors :
- &a->cached_sectors;
- int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type,
- a->gen, a->data_type, dst_sectors);
-
- if (ret)
- return ret;
- if (insert)
- alloc_data_type_set(a, ptr_data_type);
- return 0;
-}
-
-static int bch2_trigger_pointer(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- const union bch_extent_entry *entry,
- s64 *sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- bool insert = !(flags & BTREE_TRIGGER_overwrite);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp);
-
- *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len;
-
- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
- if (unlikely(!ca)) {
- if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
- ret = bch_err_throw(c, trigger_pointer);
- goto err;
- }
-
- struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
- if (!bucket_valid(ca, bucket.offset)) {
- if (insert) {
- bch2_dev_bucket_missing(ca, bucket.offset);
- ret = bch_err_throw(c, trigger_pointer);
- }
- goto err;
- }
-
- if (flags & BTREE_TRIGGER_transactional) {
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
- ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert);
- if (ret)
- goto err;
-
- ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
- if (ret)
- goto err;
- }
-
- if (flags & BTREE_TRIGGER_gc) {
- struct bucket *g = gc_bucket(ca, bucket.offset);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
- p.ptr.dev,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch_err_throw(c, trigger_pointer);
- goto err;
- }
-
- bucket_lock(g);
- struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
- ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert);
- alloc_to_bucket(g, new);
- bucket_unlock(g);
-
- if (!ret)
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
- }
-err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
- struct bkey_s_c k,
- struct extent_ptr_decoded p,
- enum bch_data_type data_type,
- s64 sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
-
- if (flags & BTREE_TRIGGER_transactional) {
- struct btree_iter iter;
- struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_stripes, POS(0, p.ec.idx),
- BTREE_ITER_with_updates, stripe);
- int ret = PTR_ERR_OR_ZERO(s);
- if (unlikely(ret)) {
- bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
- "pointer to nonexistent stripe %llu",
- (u64) p.ec.idx);
- goto err;
- }
-
- if (!bch2_ptr_matches_stripe(&s->v, p)) {
- bch2_trans_inconsistent(trans,
- "stripe pointer doesn't match stripe %llu",
- (u64) p.ec.idx);
- ret = bch_err_throw(c, trigger_stripe_pointer);
- goto err;
- }
-
- stripe_blockcount_set(&s->v, p.ec.block,
- stripe_blockcount_get(&s->v, p.ec.block) +
- sectors);
-
- struct disk_accounting_pos acc;
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = data_type;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
- }
-
- if (flags & BTREE_TRIGGER_gc) {
- struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- (u64) p.ec.idx);
- return bch_err_throw(c, ENOMEM_mark_stripe_ptr);
- }
-
- gc_stripe_lock(m);
-
- if (!m || !m->alive) {
- gc_stripe_unlock(m);
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ",
- (u64) p.ec.idx);
- bch2_bkey_val_to_text(&buf, c, k);
- __bch2_inconsistent_error(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- return bch_err_throw(c, trigger_stripe_pointer);
- }
-
- m->block_sectors[p.ec.block] += sectors;
-
- struct disk_accounting_pos acc;
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA");
- gc_stripe_unlock(m);
-
- acc.replicas.data_type = data_type;
- int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int __trigger_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- bool gc = flags & BTREE_TRIGGER_gc;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
- ? BCH_DATA_btree
- : BCH_DATA_user;
- int ret = 0;
-
- s64 replicas_sectors = 0;
-
- struct disk_accounting_pos acc_replicas_key;
- memset(&acc_replicas_key, 0, sizeof(acc_replicas_key));
- acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas;
- acc_replicas_key.replicas.data_type = data_type;
- acc_replicas_key.replicas.nr_devs = 0;
- acc_replicas_key.replicas.nr_required = 1;
-
- unsigned cur_compression_type = 0;
- u64 compression_acct[3] = { 1, 0, 0 };
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = 0;
- ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
- if (ret < 0)
- return ret;
-
- bool stale = ret > 0;
-
- if (p.ptr.cached && stale)
- continue;
-
- if (p.ptr.cached) {
- ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc);
- if (ret)
- return ret;
- } else if (!p.has_ec) {
- replicas_sectors += disk_sectors;
- replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
- } else {
- ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
- if (ret)
- return ret;
-
- /*
- * There may be other dirty pointers in this extent, but
- * if so they're not required for mounting if we have an
- * erasure coded pointer in this extent:
- */
- acc_replicas_key.replicas.nr_required = 0;
- }
-
- if (cur_compression_type &&
- cur_compression_type != p.crc.compression_type) {
- if (flags & BTREE_TRIGGER_overwrite)
- bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
-
- ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
- compression, cur_compression_type);
- if (ret)
- return ret;
-
- compression_acct[0] = 1;
- compression_acct[1] = 0;
- compression_acct[2] = 0;
- }
-
- cur_compression_type = p.crc.compression_type;
- if (p.crc.compression_type) {
- compression_acct[1] += p.crc.uncompressed_size;
- compression_acct[2] += p.crc.compressed_size;
- }
- }
-
- if (acc_replicas_key.replicas.nr_devs) {
- ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
- if (ret)
- return ret;
- }
-
- if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot);
- if (ret)
- return ret;
- }
-
- if (cur_compression_type) {
- if (flags & BTREE_TRIGGER_overwrite)
- bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
-
- ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
- compression, cur_compression_type);
- if (ret)
- return ret;
- }
-
- if (level) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id);
- if (ret)
- return ret;
- } else {
- bool insert = !(flags & BTREE_TRIGGER_overwrite);
-
- s64 v[3] = {
- insert ? 1 : -1,
- insert ? k.k->size : -((s64) k.k->size),
- replicas_sectors,
- };
- ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
- if (ret)
- return ret;
- }
-
- return 0;
-}
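/*
 * Editor's note (clarifying comment, not part of the original file): besides
 * marking each pointer, the walk above maintains several disk-accounting
 * counters per extent: sectors per replicas entry, per-snapshot sectors,
 * per-compression-type totals, and either per-btree (for btree nodes) or
 * per-inode (for user data) usage.
 */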
-
-int bch2_trigger_extent(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
- struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
- unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
- unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
-
- if (unlikely(flags & BTREE_TRIGGER_check_repair))
- return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);
-
- /* if pointers aren't changing - nothing to do: */
- if (new_ptrs_bytes == old_ptrs_bytes &&
- !memcmp(new_ptrs.start,
- old_ptrs.start,
- new_ptrs_bytes))
- return 0;
-
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- if (old.k->type) {
- int ret = __trigger_extent(trans, btree, level, old,
- flags & ~BTREE_TRIGGER_insert);
- if (ret)
- return ret;
- }
-
- if (new.k->type) {
- int ret = __trigger_extent(trans, btree, level, new.s_c,
- flags & ~BTREE_TRIGGER_overwrite);
- if (ret)
- return ret;
- }
-
- int need_rebalance_delta = 0;
- s64 need_rebalance_sectors_delta[1] = { 0 };
-
- s64 s = bch2_bkey_sectors_need_rebalance(c, old);
- need_rebalance_delta -= s != 0;
- need_rebalance_sectors_delta[0] -= s;
-
- s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
- need_rebalance_delta += s != 0;
- need_rebalance_sectors_delta[0] += s;
-
- if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
- int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- new.k->p, need_rebalance_delta > 0);
- if (ret)
- return ret;
- }
-
- if (need_rebalance_sectors_delta[0]) {
- int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
- need_rebalance_sectors_delta, rebalance_work);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-/* KEY_TYPE_reservation */
-
-static int __trigger_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 sectors[1] = { k.k->size };
-
- if (flags & BTREE_TRIGGER_overwrite)
- sectors[0] = -sectors[0];
-
- return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors,
- persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas);
- }
-
- return 0;
-}
-
-int bch2_trigger_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
-}
-
-/* Mark superblocks: */
-
-static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- enum bch_data_type type,
- unsigned sectors)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- int ret = 0;
-
- struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- if (a->v.data_type && type && a->v.data_type != type) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s\n",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- bch2_data_type_str(type),
- bch2_data_type_str(type));
-
- bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
-
- ret = bch2_run_explicit_recovery_pass(c, &buf,
- BCH_RECOVERY_PASS_check_allocations, 0);
-
- /* Always print, this is always fatal */
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- if (!ret)
- ret = bch_err_throw(c, metadata_bucket_inconsistency);
- goto err;
- }
-
- if (a->v.data_type != type ||
- a->v.dirty_sectors != sectors) {
- a->v.data_type = type;
- a->v.dirty_sectors = sectors;
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca,
- u64 b, enum bch_data_type data_type, unsigned sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- struct bucket *g = gc_bucket(ca, b);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
- ca->dev_idx, bch2_data_type_str(data_type)))
- goto err;
-
- bucket_lock(g);
- struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
-
- if (bch2_fs_inconsistent_on(g->data_type &&
- g->data_type != data_type, c,
- "different types of data in same bucket: %s, %s",
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type)))
- goto err_unlock;
-
- if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
- "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
- ca->dev_idx, b, g->gen,
- bch2_data_type_str(g->data_type ?: data_type),
- g->dirty_sectors, sectors))
- goto err_unlock;
-
- g->data_type = data_type;
- g->dirty_sectors += sectors;
- struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
- bucket_unlock(g);
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
- return ret;
-err_unlock:
- bucket_unlock(g);
-err:
- return bch_err_throw(c, metadata_bucket_inconsistency);
-}
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- enum bch_data_type type, unsigned sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- BUG_ON(type != BCH_DATA_free &&
- type != BCH_DATA_sb &&
- type != BCH_DATA_journal);
-
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return 0;
-
- if (flags & BTREE_TRIGGER_gc)
- return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags);
- else if (flags & BTREE_TRIGGER_transactional)
- return commit_do(trans, NULL, NULL, 0,
- __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
- else
- BUG();
-}
-
-static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
- struct bch_dev *ca, u64 start, u64 end,
- enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
- enum btree_iter_update_trigger_flags flags)
-{
- do {
- u64 b = sector_to_bucket(ca, start);
- unsigned sectors =
- min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
-
- if (b != *bucket && *bucket_sectors) {
- int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
- type, *bucket_sectors, flags);
- if (ret)
- return ret;
-
- *bucket_sectors = 0;
- }
-
- *bucket = b;
- *bucket_sectors += sectors;
- start += sectors;
- } while (start < end);
-
- return 0;
-}
-
-static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_layout layout = ca->disk_sb.sb->layout;
- mutex_unlock(&c->sb_lock);
-
- u64 bucket = 0;
- unsigned i, bucket_sectors = 0;
- int ret;
-
- for (i = 0; i < layout.nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout.sb_offset[i]);
-
- if (offset == BCH_SB_SECTOR) {
- ret = bch2_trans_mark_metadata_sectors(trans, ca,
- 0, BCH_SB_SECTOR,
- BCH_DATA_sb, &bucket, &bucket_sectors, flags);
- if (ret)
- return ret;
- }
-
- ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
- offset + (1 << layout.sb_max_size_bits),
- BCH_DATA_sb, &bucket, &bucket_sectors, flags);
- if (ret)
- return ret;
- }
-
- if (bucket_sectors) {
- ret = bch2_trans_mark_metadata_bucket(trans, ca,
- bucket, BCH_DATA_sb, bucket_sectors, flags);
- if (ret)
- return ret;
- }
-
- for (i = 0; i < ca->journal.nr; i++) {
- ret = bch2_trans_mark_metadata_bucket(trans, ca,
- ca->journal.buckets[i],
- BCH_DATA_journal, ca->mi.bucket_size, flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
- enum btree_iter_update_trigger_flags flags)
-{
- int ret = bch2_trans_run(c,
- __bch2_trans_mark_dev_sb(trans, ca, flags));
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
- enum btree_iter_update_trigger_flags flags)
-{
- for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) {
- int ret = bch2_trans_mark_dev_sb(c, ca, flags);
- if (ret) {
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs);
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_trans_mark_dev_sbs(struct bch_fs *c)
-{
- return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
-}
-
-bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b)
-{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- u64 b_offset = bucket_to_sector(ca, b);
- u64 b_end = bucket_to_sector(ca, b + 1);
- unsigned i;
-
- if (!b)
- return true;
-
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
- u64 end = offset + (1 << layout->sb_max_size_bits);
-
- if (!(offset >= b_end || end <= b_offset))
- return true;
- }
-
- for (i = 0; i < ca->journal.nr; i++)
- if (b == ca->journal.buckets[i])
- return true;
-
- return false;
-}
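/*
 * Editor's note (illustration, not part of the original file): the test above
 * is the usual half-open interval overlap check -- [offset, end) intersects
 * [b_offset, b_end) unless one range ends at or before the start of the other.
 * For example, a superblock spanning sectors [8, 16) overlaps a bucket covering
 * sectors [12, 20), but not one covering [16, 24).
 */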
-
-/* Disk reservations: */
-
-#define SECTORS_CACHE 1024
-
-int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, enum bch_reservation_flags flags)
-{
- struct bch_fs_pcpu *pcpu;
- u64 old, get;
- u64 sectors_available;
- int ret;
-
- percpu_down_read(&c->mark_lock);
- preempt_disable();
- pcpu = this_cpu_ptr(c->pcpu);
-
- if (sectors <= pcpu->sectors_available)
- goto out;
-
- old = atomic64_read(&c->sectors_available);
- do {
- get = min((u64) sectors + SECTORS_CACHE, old);
-
- if (get < sectors) {
- preempt_enable();
- goto recalculate;
- }
- } while (!atomic64_try_cmpxchg(&c->sectors_available,
- &old, old - get));
-
- pcpu->sectors_available += get;
-
-out:
- pcpu->sectors_available -= sectors;
- this_cpu_add(*c->online_reserved, sectors);
- res->sectors += sectors;
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
- return 0;
-
-recalculate:
- mutex_lock(&c->sectors_available_lock);
-
- percpu_u64_set(&c->pcpu->sectors_available, 0);
- sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
-
- if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL))
- sectors = min(sectors, sectors_available);
-
- if (sectors <= sectors_available ||
- (flags & BCH_DISK_RESERVATION_NOFAIL)) {
- atomic64_set(&c->sectors_available,
- max_t(s64, 0, sectors_available - sectors));
- this_cpu_add(*c->online_reserved, sectors);
- res->sectors += sectors;
- ret = 0;
- } else {
- atomic64_set(&c->sectors_available, sectors_available);
- ret = bch_err_throw(c, ENOSPC_disk_reservation);
- }
-
- mutex_unlock(&c->sectors_available_lock);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
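/*
 * Editor's note: a simplified, userspace-flavoured sketch of the reservation
 * fast path above (all names below are invented for illustration).  Each CPU
 * keeps a small cache of pre-reserved sectors and only touches the shared
 * atomic counter when the cache runs dry, refilling it in batches of
 * SECTORS_CACHE so the common case stays contention-free.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_CACHE 1024

static _Atomic uint64_t example_sectors_available = 1 << 20;
static _Thread_local uint64_t example_cached;	/* stands in for pcpu->sectors_available */

static bool example_reserve(uint64_t sectors)
{
	if (sectors <= example_cached) {	/* fast path: purely thread-local */
		example_cached -= sectors;
		return true;
	}

	uint64_t old = atomic_load(&example_sectors_available);
	for (;;) {
		uint64_t get = sectors + EXAMPLE_CACHE;
		if (get > old)
			get = old;
		if (get < sectors)
			return false;	/* caller would take the slow "recalculate" path */
		if (atomic_compare_exchange_weak(&example_sectors_available, &old, old - get)) {
			example_cached += get - sectors;	/* keep the surplus cached */
			return true;
		}
	}
}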
-
-/* Startup/shutdown: */
-
-void bch2_buckets_nouse_free(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- kvfree_rcu_mightsleep(ca->buckets_nouse);
- ca->buckets_nouse = NULL;
- }
-}
-
-int bch2_buckets_nouse_alloc(struct bch_fs *c)
-{
- for_each_member_device(c, ca) {
- BUG_ON(ca->buckets_nouse);
-
- ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
- sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO);
- if (!ca->buckets_nouse) {
- bch2_dev_put(ca);
- return bch_err_throw(c, ENOMEM_buckets_nouse);
- }
- }
-
- return 0;
-}
-
-static void bucket_gens_free_rcu(struct rcu_head *rcu)
-{
- struct bucket_gens *buckets =
- container_of(rcu, struct bucket_gens, rcu);
-
- kvfree(buckets);
-}
-
-int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
-{
- struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
- bool resize = ca->bucket_gens != NULL;
- int ret;
-
- if (resize)
- lockdep_assert_held(&c->state_lock);
-
- if (resize && ca->buckets_nouse)
- return bch_err_throw(c, no_resize_with_buckets_nouse);
-
- bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
- GFP_KERNEL|__GFP_ZERO);
- if (!bucket_gens) {
- ret = bch_err_throw(c, ENOMEM_bucket_gens);
- goto err;
- }
-
- bucket_gens->first_bucket = ca->mi.first_bucket;
- bucket_gens->nbuckets = nbuckets;
- bucket_gens->nbuckets_minus_first =
- bucket_gens->nbuckets - bucket_gens->first_bucket;
-
- old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
-
- if (resize) {
- u64 copy = min(bucket_gens->nbuckets,
- old_bucket_gens->nbuckets);
- memcpy(bucket_gens->b,
- old_bucket_gens->b,
- sizeof(bucket_gens->b[0]) * copy);
- }
-
- ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch,
- ca->mi.nbuckets, nbuckets) ?:
- bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty,
- ca->mi.nbuckets, nbuckets);
-
- rcu_assign_pointer(ca->bucket_gens, bucket_gens);
- bucket_gens = old_bucket_gens;
-
- nbuckets = ca->mi.nbuckets;
-
- ret = 0;
-err:
- if (bucket_gens)
- call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
-
- return ret;
-}
-
-void bch2_dev_buckets_free(struct bch_dev *ca)
-{
- kvfree(ca->buckets_nouse);
- kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
- free_percpu(ca->usage);
-}
-
-int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- ca->usage = alloc_percpu(struct bch_dev_usage_full);
- if (!ca->usage)
- return bch_err_throw(c, ENOMEM_usage_init);
-
- return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
-}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
deleted file mode 100644
index 49a3807a5eab..000000000000
--- a/fs/bcachefs/buckets.h
+++ /dev/null
@@ -1,369 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#ifndef _BUCKETS_H
-#define _BUCKETS_H
-
-#include "buckets_types.h"
-#include "extents.h"
-#include "sb-members.h"
-
-static inline u64 sector_to_bucket(const struct bch_dev *ca, sector_t s)
-{
- return div_u64(s, ca->mi.bucket_size);
-}
-
-static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
-{
- return ((sector_t) b) * ca->mi.bucket_size;
-}
-
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
- u32 remainder;
-
- div_u64_rem(s, ca->mi.bucket_size, &remainder);
- return remainder;
-}
-
-static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, u32 *offset)
-{
- return div_u64_rem(s, ca->mi.bucket_size, offset);
-}
-
-#define for_each_bucket(_b, _buckets) \
- for (_b = (_buckets)->b + (_buckets)->first_bucket; \
- _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-
-static inline void bucket_unlock(struct bucket *b)
-{
- BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
-
- clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
- smp_mb__after_atomic();
- wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
-}
-
-static inline void bucket_lock(struct bucket *b)
-{
- wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
- TASK_UNINTERRUPTIBLE);
-}
-
-static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
-{
- return bucket_valid(ca, b)
- ? genradix_ptr(&ca->buckets_gc, b)
- : NULL;
-}
-
-static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
-{
- return rcu_dereference_check(ca->bucket_gens,
- lockdep_is_held(&ca->fs->state_lock));
-}
-
-static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
-{
- struct bucket_gens *gens = bucket_gens(ca);
-
- if (b - gens->first_bucket >= gens->nbuckets_minus_first)
- return NULL;
- return gens->b + b;
-}
-
-static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
-{
- u8 *gen = bucket_gen(ca, b);
- return gen ? *gen : -1;
-}
-
-static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
-{
- guard(rcu)();
- return bucket_gen_get_rcu(ca, b);
-}
-
-static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return sector_to_bucket(ca, ptr->offset);
-}
-
-static inline struct bpos PTR_BUCKET_POS(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
-}
-
-static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr,
- u32 *bucket_offset)
-{
- return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
-}
-
-static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
-}
-
-static inline enum bch_data_type ptr_data_type(const struct bkey *k,
- const struct bch_extent_ptr *ptr)
-{
- if (bkey_is_btree_ptr(k))
- return BCH_DATA_btree;
-
- return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
-}
-
-static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
-{
- EBUG_ON(sectors < 0);
-
- return crc_is_compressed(p.crc)
- ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
- p.crc.uncompressed_size)
- : sectors;
-}
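/*
 * Editor's note (worked example, not in the original): for a compressed extent
 * the on-disk footprint is scaled by the compression ratio, rounded up.  E.g.
 * 8 live sectors of an extent with crc.compressed_size == 4 and
 * crc.uncompressed_size == 16 occupy DIV_ROUND_UP(8 * 4, 16) == 2 disk sectors.
 */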
-
-static inline int gen_cmp(u8 a, u8 b)
-{
- return (s8) (a - b);
-}
-
-static inline int gen_after(u8 a, u8 b)
-{
- return max(0, gen_cmp(a, b));
-}
-
-static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
- int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr));
- return gen < 0 ? gen : gen_after(gen, ptr->gen);
-}
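/*
 * Editor's note (illustration, not part of the original file): gen_cmp() uses
 * signed 8-bit wraparound so staleness checks stay correct after a bucket's
 * generation counter wraps past 255.  For example, if a bucket's gen has
 * wrapped from 254 to 1 while a cached pointer still holds gen 254:
 *
 *	gen_cmp(1, 254) == (s8) 0x03 ==  3   -> gen_after() > 0, the pointer is stale
 *	gen_cmp(254, 1) == (s8) 0xfd == -3   -> gen_after() == 0, the pointer is current
 */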
-
-/**
- * dev_ptr_stale() - check if a pointer points into a bucket that has been
- * invalidated.
- */
-static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
- guard(rcu)();
- return dev_ptr_stale_rcu(ca, ptr);
-}
-
-/* Device usage: */
-
-void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
-static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
-{
- struct bch_dev_usage ret;
-
- bch2_dev_usage_read_fast(ca, &ret);
- return ret;
-}
-
-void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *);
-static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca)
-{
- struct bch_dev_usage_full ret;
-
- bch2_dev_usage_full_read_fast(ca, &ret);
- return ret;
-}
-
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *);
-
-static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
-{
- s64 reserved = 0;
-
- switch (watermark) {
- case BCH_WATERMARK_NR:
- BUG();
- case BCH_WATERMARK_stripe:
- reserved += ca->mi.nbuckets >> 6;
- fallthrough;
- case BCH_WATERMARK_normal:
- reserved += ca->mi.nbuckets >> 6;
- fallthrough;
- case BCH_WATERMARK_copygc:
- reserved += ca->nr_btree_reserve;
- fallthrough;
- case BCH_WATERMARK_btree:
- reserved += ca->nr_btree_reserve;
- fallthrough;
- case BCH_WATERMARK_btree_copygc:
- case BCH_WATERMARK_reclaim:
- case BCH_WATERMARK_interior_updates:
- break;
- }
-
- return reserved;
-}
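/*
 * Editor's note (worked example, not in the original): because of the
 * fallthroughs above, the reserve is cumulative -- ordinary allocations have
 * the most buckets held back from them, while critical internal operations
 * (reclaim, interior updates) can dig all the way in.  With
 * nbuckets = 1 << 20 and nr_btree_reserve = R:
 *
 *	BCH_WATERMARK_stripe:   2 * (nbuckets >> 6) + 2 * R  ==  32768 + 2R
 *	BCH_WATERMARK_normal:       (nbuckets >> 6) + 2 * R  ==  16384 + 2R
 *	BCH_WATERMARK_copygc:                         2 * R
 *	BCH_WATERMARK_btree:                              R
 *	BCH_WATERMARK_btree_copygc and below:             0
 */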
-
-static inline u64 dev_buckets_free(struct bch_dev *ca,
- struct bch_dev_usage usage,
- enum bch_watermark watermark)
-{
- return max_t(s64, 0,
- usage.buckets[BCH_DATA_free]-
- ca->nr_open_buckets -
- bch2_dev_buckets_reserved(ca, watermark));
-}
-
-static inline u64 __dev_buckets_available(struct bch_dev *ca,
- struct bch_dev_usage usage,
- enum bch_watermark watermark)
-{
- return max_t(s64, 0,
- usage.buckets[BCH_DATA_free]
- + usage.buckets[BCH_DATA_cached]
- + usage.buckets[BCH_DATA_need_gc_gens]
- + usage.buckets[BCH_DATA_need_discard]
- - ca->nr_open_buckets
- - bch2_dev_buckets_reserved(ca, watermark));
-}
-
-static inline u64 dev_buckets_available(struct bch_dev *ca,
- enum bch_watermark watermark)
-{
- return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark);
-}
-
-/* Filesystem usage: */
-
-struct bch_fs_usage_short
-bch2_fs_usage_read_short(struct bch_fs *);
-
-int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
- struct bkey_s_c, const struct bch_extent_ptr *,
- s64, enum bch_data_type, u8, u8, u32 *);
-
-int bch2_check_fix_ptrs(struct btree_trans *,
- enum btree_id, unsigned, struct bkey_s_c,
- enum btree_iter_update_trigger_flags);
-
-int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
-({ \
- int ret = 0; \
- \
- if (_old.k->type) \
- ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_insert); \
- if (!ret && _new.k->type) \
- ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_overwrite);\
- ret; \
-})
-
-void bch2_trans_account_disk_usage_change(struct btree_trans *);
-
-int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
- enum bch_data_type, unsigned,
- enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *,
- enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
- enum btree_iter_update_trigger_flags);
-int bch2_trans_mark_dev_sbs(struct bch_fs *);
-
-bool bch2_is_superblock_bucket(struct bch_dev *, u64);
-
-static inline const char *bch2_data_type_str(enum bch_data_type type)
-{
- return type < BCH_DATA_NR
- ? __bch2_data_types[type]
- : "(invalid data type)";
-}
-
-/* disk reservations: */
-
-static inline void bch2_disk_reservation_put(struct bch_fs *c,
- struct disk_reservation *res)
-{
- if (res->sectors) {
- this_cpu_sub(*c->online_reserved, res->sectors);
- res->sectors = 0;
- }
-}
-
-enum bch_reservation_flags {
- BCH_DISK_RESERVATION_NOFAIL = 1 << 0,
- BCH_DISK_RESERVATION_PARTIAL = 1 << 1,
-};
-
-int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *,
- u64, enum bch_reservation_flags);
-
-static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, enum bch_reservation_flags flags)
-{
-#ifdef __KERNEL__
- u64 old, new;
-
- old = this_cpu_read(c->pcpu->sectors_available);
- do {
- if (sectors > old)
- return __bch2_disk_reservation_add(c, res, sectors, flags);
-
- new = old - sectors;
- } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new));
-
- this_cpu_add(*c->online_reserved, sectors);
- res->sectors += sectors;
- return 0;
-#else
- return __bch2_disk_reservation_add(c, res, sectors, flags);
-#endif
-}
-
-static inline struct disk_reservation
-bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
-{
- return (struct disk_reservation) {
- .sectors = 0,
-#if 0
- /* not used yet: */
- .gen = c->capacity_gen,
-#endif
- .nr_replicas = nr_replicas,
- };
-}
-
-static inline int bch2_disk_reservation_get(struct bch_fs *c,
- struct disk_reservation *res,
- u64 sectors, unsigned nr_replicas,
- int flags)
-{
- *res = bch2_disk_reservation_init(c, nr_replicas);
-
- return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
-}
-
-#define RESERVE_FACTOR 6
-
-static inline u64 avail_factor(u64 r)
-{
- return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
-}
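/*
 * Editor's note (worked example, not in the original): with RESERVE_FACTOR == 6
 * this is r * 64 / 65, i.e. roughly 1.5% of raw free space is held back when
 * sizing reservations:
 *
 *	avail_factor(65)      == 64
 *	avail_factor(1 << 20) == 1032444
 */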
-
-void bch2_buckets_nouse_free(struct bch_fs *);
-int bch2_buckets_nouse_alloc(struct bch_fs *);
-
-int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
-void bch2_dev_buckets_free(struct bch_dev *);
-int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
-
-#endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
deleted file mode 100644
index 0aed2500ade3..000000000000
--- a/fs/bcachefs/buckets_types.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_TYPES_H
-#define _BUCKETS_TYPES_H
-
-#include "bcachefs_format.h"
-#include "util.h"
-
-#define BUCKET_JOURNAL_SEQ_BITS 16
-
-/*
- * Ugly hack alert:
- *
- * We need to cram a spinlock in a single byte, because that's what we have left
- * in struct bucket, and we care about the size of these - during fsck, we need
- * in memory state for every single bucket on every device.
- *
- * We used to do
- * while (xchg(&b->lock, 1) cpu_relax();
- * but, it turns out not all architectures support xchg on a single byte.
- *
- * So now we use bit_spin_lock(), with fun games since we can't burn a whole
- * ulong for this - we just need to make sure the lock bit always ends up in the
- * first byte.
- */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define BUCKET_LOCK_BITNR 0
-#else
-#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
-#endif
-
-union ulong_byte_assert {
- ulong ulong;
- u8 byte;
-};
-
-struct bucket {
- u8 lock;
- u8 gen_valid:1;
- u8 data_type:7;
- u8 gen;
- u8 stripe_redundancy;
- u32 stripe;
- u32 dirty_sectors;
- u32 cached_sectors;
- u32 stripe_sectors;
-} __aligned(sizeof(long));
-
-struct bucket_gens {
- struct rcu_head rcu;
- u16 first_bucket;
- size_t nbuckets;
- size_t nbuckets_minus_first;
- u8 b[] __counted_by(nbuckets);
-};
-
-/* Only info on bucket counts: */
-struct bch_dev_usage {
- u64 buckets[BCH_DATA_NR];
-};
-
-struct bch_dev_usage_full {
- struct bch_dev_usage_type {
- u64 buckets;
- u64 sectors; /* _compressed_ sectors: */
- /*
- * XXX
- * Why do we have this? Isn't it just buckets * bucket_size -
- * sectors?
- */
- u64 fragmented;
- } d[BCH_DATA_NR];
-};
-
-struct bch_fs_usage_base {
- u64 hidden;
- u64 btree;
- u64 data;
- u64 cached;
- u64 reserved;
- u64 nr_inodes;
-};
-
-struct bch_fs_usage_short {
- u64 capacity;
- u64 used;
- u64 free;
- u64 nr_inodes;
-};
-
-/*
- * A reservation for space on disk:
- */
-struct disk_reservation {
- u64 sectors;
- u32 gen;
- unsigned nr_replicas;
-};
-
-#endif /* _BUCKETS_TYPES_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
deleted file mode 100644
index 832eff93acb6..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ /dev/null
@@ -1,174 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "buckets_waiting_for_journal.h"
-#include <linux/hash.h>
-#include <linux/random.h>
-
-static inline struct bucket_hashed *
-bucket_hash(struct buckets_waiting_for_journal_table *t,
- unsigned hash_seed_idx, u64 dev_bucket)
-{
- return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
-}
-
-static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
-{
- unsigned i;
-
- t->bits = bits;
- for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
- get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
- memset(t->d, 0, sizeof(t->d[0]) << t->bits);
-}
-
-u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b,
- unsigned dev, u64 bucket)
-{
- struct buckets_waiting_for_journal_table *t;
- u64 dev_bucket = (u64) dev << 56 | bucket;
- u64 ret = 0;
-
- mutex_lock(&b->lock);
- t = b->t;
-
- for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
- struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
-
- if (h->dev_bucket == dev_bucket) {
- ret = h->journal_seq;
- break;
- }
- }
-
- mutex_unlock(&b->lock);
-
- return ret;
-}
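/*
 * Editor's note: a stripped-down sketch of the lookup above (all names below
 * are invented for illustration).  The device index is packed into the top
 * byte of the key and the table is probed in up to three slots, one per hash
 * seed, in cuckoo-hash fashion; a miss means the bucket is not waiting on the
 * journal.  Assumes bits >= 1.
 */
#include <stdint.h>

struct example_entry { uint64_t dev_bucket, journal_seq; };

static inline uint64_t example_pack(unsigned dev, uint64_t bucket)
{
	return (uint64_t) dev << 56 | bucket;	/* same packing as dev_bucket above */
}

static uint64_t example_lookup(const struct example_entry *d, unsigned bits,
			       const uint64_t seeds[3], uint64_t dev_bucket)
{
	for (unsigned i = 0; i < 3; i++) {
		/* stand-in for hash_64(): multiplicative hash folded to 'bits' bits */
		uint64_t h = ((dev_bucket ^ seeds[i]) * 0x9e3779b97f4a7c15ULL) >> (64 - bits);

		if (d[h].dev_bucket == dev_bucket)
			return d[h].journal_seq;
	}
	return 0;
}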
-
-static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
- struct bucket_hashed *new,
- u64 flushed_seq)
-{
- struct bucket_hashed *last_evicted = NULL;
- unsigned tries, i;
-
- for (tries = 0; tries < 10; tries++) {
- struct bucket_hashed *old, *victim = NULL;
-
- for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
- old = bucket_hash(t, i, new->dev_bucket);
-
- if (old->dev_bucket == new->dev_bucket ||
- old->journal_seq <= flushed_seq) {
- *old = *new;
- return true;
- }
-
- if (last_evicted != old)
- victim = old;
- }
-
- /* hashed to same slot 3 times: */
- if (!victim)
- break;
-
- /* Failed to find an empty slot: */
- swap(*new, *victim);
- last_evicted = victim;
- }
-
- return false;
-}
-
-int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
- u64 flushed_seq,
- unsigned dev, u64 bucket,
- u64 journal_seq)
-{
- struct buckets_waiting_for_journal_table *t, *n;
- struct bucket_hashed tmp, new = {
- .dev_bucket = (u64) dev << 56 | bucket,
- .journal_seq = journal_seq,
- };
- size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0;
- int ret = 0;
-
- mutex_lock(&b->lock);
-
- if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
- goto out;
-
- t = b->t;
- size = 1UL << t->bits;
- for (i = 0; i < size; i++)
- nr_elements += t->d[i].journal_seq > flushed_seq;
-
- new_bits = ilog2(roundup_pow_of_two(nr_elements * 3));
-realloc:
- n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
- if (!n) {
- struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal);
- ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set);
- goto out;
- }
-
-retry_rehash:
- if (nr_rehashes_this_size == 3) {
- new_bits++;
- nr_rehashes_this_size = 0;
- kvfree(n);
- goto realloc;
- }
-
- nr_rehashes++;
- nr_rehashes_this_size++;
-
- bucket_table_init(n, new_bits);
-
- tmp = new;
- BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
-
- for (i = 0; i < 1UL << t->bits; i++) {
- if (t->d[i].journal_seq <= flushed_seq)
- continue;
-
- tmp = t->d[i];
- if (!bucket_table_insert(n, &tmp, flushed_seq))
- goto retry_rehash;
- }
-
- b->t = n;
- kvfree(t);
-
- pr_debug("took %zu rehashes, table at %zu/%lu elements",
- nr_rehashes, nr_elements, 1UL << b->t->bits);
-out:
- mutex_unlock(&b->lock);
-
- return ret;
-}
-
-void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
-{
- struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
-
- kvfree(b->t);
-}
-
-#define INITIAL_TABLE_BITS 3
-
-int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
-{
- struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
-
- mutex_init(&b->lock);
-
- b->t = kvmalloc(sizeof(*b->t) +
- (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
- if (!b->t)
- return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init;
-
- bucket_table_init(b->t, INITIAL_TABLE_BITS);
- return 0;
-}
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
deleted file mode 100644
index 365619ca44c8..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
-#define _BUCKETS_WAITING_FOR_JOURNAL_H
-
-#include "buckets_waiting_for_journal_types.h"
-
-u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *,
- unsigned, u64);
-int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
- u64, unsigned, u64, u64);
-
-void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
-int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
-
-#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
deleted file mode 100644
index e593db061d81..000000000000
--- a/fs/bcachefs/buckets_waiting_for_journal_types.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
-#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
-
-#include <linux/siphash.h>
-
-struct bucket_hashed {
- u64 dev_bucket;
- u64 journal_seq;
-};
-
-struct buckets_waiting_for_journal_table {
- unsigned bits;
- u64 hash_seeds[3];
- struct bucket_hashed d[];
-};
-
-struct buckets_waiting_for_journal {
- struct mutex lock;
- struct buckets_waiting_for_journal_table *t;
-};
-
-#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
deleted file mode 100644
index 5ea89aa2b0c4..000000000000
--- a/fs/bcachefs/chardev.c
+++ /dev/null
@@ -1,843 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_CHARDEV
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "disk_accounting.h"
-#include "fsck.h"
-#include "journal.h"
-#include "move.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-counters.h"
-#include "super-io.h"
-#include "thread_with_file.h"
-
-#include <linux/cdev.h>
-#include <linux/device.h>
-#include <linux/fs.h>
-#include <linux/ioctl.h>
-#include <linux/major.h>
-#include <linux/sched/task.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-/* returns with ref on ca->ref */
-static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
- unsigned flags)
-{
- struct bch_dev *ca;
-
- if (flags & BCH_BY_INDEX) {
- if (dev >= c->sb.nr_devices)
- return ERR_PTR(-EINVAL);
-
- ca = bch2_dev_tryget_noerror(c, dev);
- if (!ca)
- return ERR_PTR(-EINVAL);
- } else {
- char *path;
-
- path = strndup_user((const char __user *)
- (unsigned long) dev, PATH_MAX);
- if (IS_ERR(path))
- return ERR_CAST(path);
-
- ca = bch2_dev_lookup(c, path);
- kfree(path);
- }
-
- return ca;
-}
-
-#if 0
-static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
-{
- struct bch_ioctl_assemble arg;
- struct bch_fs *c;
- u64 *user_devs = NULL;
- char **devs = NULL;
- unsigned i;
- int ret = -EFAULT;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
- if (!user_devs)
- return -ENOMEM;
-
- devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
-
- if (copy_from_user(user_devs, user_arg->devs,
- sizeof(u64) * arg.nr_devs))
- goto err;
-
- for (i = 0; i < arg.nr_devs; i++) {
- devs[i] = strndup_user((const char __user *)(unsigned long)
- user_devs[i],
- PATH_MAX);
-		ret = PTR_ERR_OR_ZERO(devs[i]);
- if (ret)
- goto err;
- }
-
- c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
- ret = PTR_ERR_OR_ZERO(c);
- if (!ret)
- closure_put(&c->cl);
-err:
- if (devs)
- for (i = 0; i < arg.nr_devs; i++)
- kfree(devs[i]);
- kfree(devs);
- return ret;
-}
-
-static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
-{
- struct bch_ioctl_incremental arg;
- const char *err;
- char *path;
-	char *path;
-	int ret;
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(path);
- if (ret)
- return ret;
-
- err = bch2_fs_open_incremental(path);
- kfree(path);
-
- if (err) {
- pr_err("Could not register bcachefs devices: %s", err);
- return -EINVAL;
- }
-
- return 0;
-}
-#endif
-
-static long bch2_global_ioctl(unsigned cmd, void __user *arg)
-{
- long ret;
-
- switch (cmd) {
-#if 0
- case BCH_IOCTL_ASSEMBLE:
- return bch2_ioctl_assemble(arg);
- case BCH_IOCTL_INCREMENTAL:
- return bch2_ioctl_incremental(arg);
-#endif
- case BCH_IOCTL_FSCK_OFFLINE: {
- ret = bch2_ioctl_fsck_offline(arg);
- break;
- }
- default:
- ret = -ENOTTY;
- break;
- }
-
- if (ret < 0)
- ret = bch2_err_class(ret);
- return ret;
-}
-
-static long bch2_ioctl_query_uuid(struct bch_fs *c,
- struct bch_ioctl_query_uuid __user *user_arg)
-{
- return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid,
- sizeof(c->sb.user_uuid));
-}
-
-#if 0
-static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
-{
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- return bch2_fs_start(c);
-}
-
-static long bch2_ioctl_stop(struct bch_fs *c)
-{
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- bch2_fs_stop(c);
- return 0;
-}
-#endif
-
-static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- char *path;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(path);
- if (ret)
- return ret;
-
- ret = bch2_dev_add(c, path);
- if (!IS_ERR(path))
- kfree(path);
-
- return ret;
-}
-
-static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- struct bch_dev *ca;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
- BCH_FORCE_IF_METADATA_LOST|
- BCH_FORCE_IF_DEGRADED|
- BCH_BY_INDEX)) ||
- arg.pad)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- return bch2_dev_remove(c, ca, arg.flags);
-}
-
-static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- char *path;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(path);
- if (ret)
- return ret;
-
- ret = bch2_dev_online(c, path);
- kfree(path);
- return ret;
-}
-
-static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
- BCH_FORCE_IF_METADATA_LOST|
- BCH_FORCE_IF_DEGRADED|
- BCH_BY_INDEX)) ||
- arg.pad)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_dev_offline(c, ca, arg.flags);
- bch2_dev_put(ca);
- return ret;
-}
-
-static long bch2_ioctl_disk_set_state(struct bch_fs *c,
- struct bch_ioctl_disk_set_state arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
- BCH_FORCE_IF_METADATA_LOST|
- BCH_FORCE_IF_DEGRADED|
- BCH_BY_INDEX)) ||
- arg.pad[0] || arg.pad[1] || arg.pad[2] ||
- arg.new_state >= BCH_MEMBER_STATE_NR)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
- if (ret)
- bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
-
- bch2_dev_put(ca);
- return ret;
-}
-
-struct bch_data_ctx {
- struct thread_with_file thr;
-
- struct bch_fs *c;
- struct bch_ioctl_data arg;
- struct bch_move_stats stats;
-};
-
-static int bch2_data_thread(void *arg)
-{
- struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
-
- ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
- if (ctx->thr.ret == -BCH_ERR_device_offline)
- ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
- else {
- ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
- ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done;
- }
- enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data);
- return 0;
-}
-
-static int bch2_data_job_release(struct inode *inode, struct file *file)
-{
- struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
-
- bch2_thread_with_file_exit(&ctx->thr);
- kfree(ctx);
- return 0;
-}
-
-static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
-{
- struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
- struct bch_fs *c = ctx->c;
- struct bch_ioctl_data_event e = {
- .type = BCH_DATA_EVENT_PROGRESS,
- .ret = ctx->stats.ret,
- .p.data_type = ctx->stats.data_type,
- .p.btree_id = ctx->stats.pos.btree,
- .p.pos = ctx->stats.pos.pos,
- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
- .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected),
- .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected),
- };
-
- if (ctx->arg.op == BCH_DATA_OP_scrub) {
- struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
- if (ca) {
- struct bch_dev_usage_full u;
- bch2_dev_usage_full_read_fast(ca, &u);
- for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
- if (ctx->arg.scrub.data_types & BIT(i))
- e.p.sectors_total += u.d[i].sectors;
- bch2_dev_put(ca);
- }
- } else {
- e.p.sectors_total = bch2_fs_usage_read_short(c).used;
- }
-
- if (len < sizeof(e))
- return -EINVAL;
-
- return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e);
-}
-
-static const struct file_operations bcachefs_data_ops = {
- .release = bch2_data_job_release,
- .read = bch2_data_job_read,
-};
-
-static long bch2_ioctl_data(struct bch_fs *c,
- struct bch_ioctl_data arg)
-{
- struct bch_data_ctx *ctx;
- int ret;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data))
- return -EROFS;
-
- if (!capable(CAP_SYS_ADMIN)) {
- ret = -EPERM;
- goto put_ref;
- }
-
- if (arg.op >= BCH_DATA_OP_NR || arg.flags) {
- ret = -EINVAL;
- goto put_ref;
- }
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx) {
- ret = -ENOMEM;
- goto put_ref;
- }
-
- ctx->c = c;
- ctx->arg = arg;
-
- ret = bch2_run_thread_with_file(&ctx->thr,
- &bcachefs_data_ops,
- bch2_data_thread);
- if (ret < 0)
- goto cleanup;
- return ret;
-cleanup:
- kfree(ctx);
-put_ref:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data);
- return ret;
-}
-
-static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c,
- struct bch_ioctl_fs_usage __user *user_arg)
-{
- struct bch_ioctl_fs_usage arg = {};
- darray_char replicas = {};
- u32 replica_entries_bytes;
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
- return -EFAULT;
-
- ret = bch2_fs_replicas_usage_read(c, &replicas) ?:
- (replica_entries_bytes < replicas.nr ? -ERANGE : 0) ?:
- copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr);
- if (ret)
- goto err;
-
- struct bch_fs_usage_short u = bch2_fs_usage_read_short(c);
- arg.capacity = c->capacity;
- arg.used = u.used;
- arg.online_reserved = percpu_u64_get(c->online_reserved);
- arg.replica_entries_bytes = replicas.nr;
-
- for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
- struct disk_accounting_pos k;
- disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i);
-
- bch2_accounting_mem_read(c,
- disk_accounting_pos_to_bpos(&k),
- &arg.persistent_reserved[i], 1);
- }
-
- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-err:
- darray_exit(&replicas);
- return ret;
-}
-
-static long bch2_ioctl_query_accounting(struct bch_fs *c,
- struct bch_ioctl_query_accounting __user *user_arg)
-{
- struct bch_ioctl_query_accounting arg;
- darray_char accounting = {};
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?:
- bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?:
- (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?:
- copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr);
- if (ret)
- goto err;
-
- arg.capacity = c->capacity;
- arg.used = bch2_fs_usage_read_short(c).used;
- arg.online_reserved = percpu_u64_get(c->online_reserved);
- arg.accounting_u64s = accounting.nr / sizeof(u64);
-
- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-err:
- darray_exit(&accounting);
- return ret;
-}
-
-/* obsolete, didn't allow for new data types: */
-static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c,
- struct bch_ioctl_dev_usage __user *user_arg)
-{
- struct bch_ioctl_dev_usage arg;
- struct bch_dev_usage_full src;
- struct bch_dev *ca;
- unsigned i;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad[0] ||
- arg.pad[1] ||
- arg.pad[2])
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- src = bch2_dev_usage_full_read(ca);
-
- arg.state = ca->mi.state;
- arg.bucket_size = ca->mi.bucket_size;
- arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
-
- for (i = 0; i < ARRAY_SIZE(arg.d); i++) {
- arg.d[i].buckets = src.d[i].buckets;
- arg.d[i].sectors = src.d[i].sectors;
- arg.d[i].fragmented = src.d[i].fragmented;
- }
-
- bch2_dev_put(ca);
-
- return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
-}
-
-static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
- struct bch_ioctl_dev_usage_v2 __user *user_arg)
-{
- struct bch_ioctl_dev_usage_v2 arg;
- struct bch_dev_usage_full src;
- struct bch_dev *ca;
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad[0] ||
- arg.pad[1] ||
- arg.pad[2])
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- src = bch2_dev_usage_full_read(ca);
-
- arg.state = ca->mi.state;
- arg.bucket_size = ca->mi.bucket_size;
- arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR);
- arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
-
- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
- if (ret)
- goto err;
-
- for (unsigned i = 0; i < arg.nr_data_types; i++) {
- struct bch_ioctl_dev_usage_type t = {
- .buckets = src.d[i].buckets,
- .sectors = src.d[i].sectors,
- .fragmented = src.d[i].fragmented,
- };
-
- ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t));
- if (ret)
- goto err;
- }
-err:
- bch2_dev_put(ca);
- return ret;
-}
-
-static long bch2_ioctl_read_super(struct bch_fs *c,
- struct bch_ioctl_read_super arg)
-{
- struct bch_dev *ca = NULL;
- struct bch_sb *sb;
- int ret = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
- arg.pad)
- return -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- if (arg.flags & BCH_READ_DEV) {
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- ret = PTR_ERR_OR_ZERO(ca);
- if (ret)
- goto err_unlock;
-
- sb = ca->disk_sb.sb;
- } else {
- sb = c->disk_sb.sb;
- }
-
- if (vstruct_bytes(sb) > arg.size) {
- ret = -ERANGE;
- goto err;
- }
-
- ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
- vstruct_bytes(sb));
-err:
- bch2_dev_put(ca);
-err_unlock:
- mutex_unlock(&c->sb_lock);
- return ret;
-}
-
-static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
- struct bch_ioctl_disk_get_idx arg)
-{
- dev_t dev = huge_decode_dev(arg.dev);
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!dev)
- return -EINVAL;
-
- guard(rcu)();
- for_each_online_member_rcu(c, ca)
- if (ca->dev == dev)
- return ca->dev_idx;
-
- return bch_err_throw(c, ENOENT_dev_idx_not_found);
-}
-
-static long bch2_ioctl_disk_resize(struct bch_fs *c,
- struct bch_ioctl_disk_resize arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_dev_resize(c, ca, arg.nbuckets);
-
- bch2_dev_put(ca);
- return ret;
-}
-
-static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
- struct bch_ioctl_disk_resize_journal arg)
-{
- struct bch_dev *ca;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if ((arg.flags & ~BCH_BY_INDEX) ||
- arg.pad)
- return -EINVAL;
-
- if (arg.nbuckets > U32_MAX)
- return -EINVAL;
-
- ca = bch2_device_lookup(c, arg.dev, arg.flags);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
-
- bch2_dev_put(ca);
- return ret;
-}
-
-#define BCH_IOCTL(_name, _argtype) \
-do { \
- _argtype i; \
- \
- if (copy_from_user(&i, arg, sizeof(i))) \
- return -EFAULT; \
- ret = bch2_ioctl_##_name(c, i); \
- goto out; \
-} while (0)
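/*
 * Editor's note (clarifying comment, not in the original): each BCH_IOCTL()
 * expansion copies the ioctl argument struct in from userspace, dispatches to
 * the matching bch2_ioctl_*() handler, and jumps to "out" so every handler's
 * return value goes through the common bch2_err_class() conversion below.
 */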
-
-long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
-{
- long ret;
-
- switch (cmd) {
- case BCH_IOCTL_QUERY_UUID:
- return bch2_ioctl_query_uuid(c, arg);
- case BCH_IOCTL_FS_USAGE:
- return bch2_ioctl_fs_usage(c, arg);
- case BCH_IOCTL_DEV_USAGE:
- return bch2_ioctl_dev_usage(c, arg);
- case BCH_IOCTL_DEV_USAGE_V2:
- return bch2_ioctl_dev_usage_v2(c, arg);
-#if 0
- case BCH_IOCTL_START:
- BCH_IOCTL(start, struct bch_ioctl_start);
- case BCH_IOCTL_STOP:
- return bch2_ioctl_stop(c);
-#endif
- case BCH_IOCTL_READ_SUPER:
- BCH_IOCTL(read_super, struct bch_ioctl_read_super);
- case BCH_IOCTL_DISK_GET_IDX:
- BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
- }
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EINVAL;
-
- switch (cmd) {
- case BCH_IOCTL_DISK_ADD:
- BCH_IOCTL(disk_add, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_REMOVE:
- BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_ONLINE:
- BCH_IOCTL(disk_online, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_OFFLINE:
- BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
- case BCH_IOCTL_DISK_SET_STATE:
- BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
- case BCH_IOCTL_DATA:
- BCH_IOCTL(data, struct bch_ioctl_data);
- case BCH_IOCTL_DISK_RESIZE:
- BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
- case BCH_IOCTL_DISK_RESIZE_JOURNAL:
- BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
- case BCH_IOCTL_FSCK_ONLINE:
- BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
- case BCH_IOCTL_QUERY_ACCOUNTING:
- return bch2_ioctl_query_accounting(c, arg);
- case BCH_IOCTL_QUERY_COUNTERS:
- return bch2_ioctl_query_counters(c, arg);
- default:
- return -ENOTTY;
- }
-out:
- if (ret < 0)
- ret = bch2_err_class(ret);
- return ret;
-}
-
-static DEFINE_IDR(bch_chardev_minor);
-
-static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
-{
- unsigned minor = iminor(file_inode(filp));
- struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
- void __user *arg = (void __user *) v;
-
- return c
- ? bch2_fs_ioctl(c, cmd, arg)
- : bch2_global_ioctl(cmd, arg);
-}
-
-static const struct file_operations bch_chardev_fops = {
- .owner = THIS_MODULE,
- .unlocked_ioctl = bch2_chardev_ioctl,
- .open = nonseekable_open,
-};
-
-static int bch_chardev_major;
-static const struct class bch_chardev_class = {
- .name = "bcachefs",
-};
-static struct device *bch_chardev;
-
-void bch2_fs_chardev_exit(struct bch_fs *c)
-{
- if (!IS_ERR_OR_NULL(c->chardev))
- device_unregister(c->chardev);
- if (c->minor >= 0)
- idr_remove(&bch_chardev_minor, c->minor);
-}
-
-int bch2_fs_chardev_init(struct bch_fs *c)
-{
- c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
- if (c->minor < 0)
- return c->minor;
-
- c->chardev = device_create(&bch_chardev_class, NULL,
- MKDEV(bch_chardev_major, c->minor), c,
- "bcachefs%u-ctl", c->minor);
- if (IS_ERR(c->chardev))
- return PTR_ERR(c->chardev);
-
- return 0;
-}
-
-void bch2_chardev_exit(void)
-{
- device_destroy(&bch_chardev_class, MKDEV(bch_chardev_major, U8_MAX));
- class_unregister(&bch_chardev_class);
- if (bch_chardev_major > 0)
- unregister_chrdev(bch_chardev_major, "bcachefs");
-}
-
-int __init bch2_chardev_init(void)
-{
- int ret;
-
- bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
- if (bch_chardev_major < 0)
- return bch_chardev_major;
-
- ret = class_register(&bch_chardev_class);
- if (ret)
- goto major_out;
-
- bch_chardev = device_create(&bch_chardev_class, NULL,
- MKDEV(bch_chardev_major, U8_MAX),
- NULL, "bcachefs-ctl");
- if (IS_ERR(bch_chardev)) {
- ret = PTR_ERR(bch_chardev);
- goto class_out;
- }
-
- return 0;
-
-class_out:
- class_unregister(&bch_chardev_class);
-major_out:
- unregister_chrdev(bch_chardev_major, "bcachefs-ctl");
- return ret;
-}
-
-#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
deleted file mode 100644
index 0f563ca53c36..000000000000
--- a/fs/bcachefs/chardev.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CHARDEV_H
-#define _BCACHEFS_CHARDEV_H
-
-#ifndef NO_BCACHEFS_FS
-
-long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
-
-void bch2_fs_chardev_exit(struct bch_fs *);
-int bch2_fs_chardev_init(struct bch_fs *);
-
-void bch2_chardev_exit(void);
-int __init bch2_chardev_init(void);
-
-#else
-
-static inline long bch2_fs_ioctl(struct bch_fs *c,
- unsigned cmd, void __user * arg)
-{
- return -ENOTTY;
-}
-
-static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
-static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
-
-static inline void bch2_chardev_exit(void) {}
-static inline int __init bch2_chardev_init(void) { return 0; }
-
-#endif /* NO_BCACHEFS_FS */
-
-#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
deleted file mode 100644
index a6795e73f0b9..000000000000
--- a/fs/bcachefs/checksum.c
+++ /dev/null
@@ -1,698 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "checksum.h"
-#include "errcode.h"
-#include "error.h"
-#include "super.h"
-#include "super-io.h"
-
-#include <linux/crc32c.h>
-#include <linux/xxhash.h>
-#include <linux/key.h>
-#include <linux/random.h>
-#include <linux/ratelimit.h>
-#include <crypto/chacha.h>
-#include <crypto/poly1305.h>
-#include <keys/user-type.h>
-
-/*
- * bch2_checksum state is an abstraction of the checksum state calculated over different pages.
- * it features page merging without having the checksum algorithm lose its state.
- * for native checksum aglorithms (like crc), a default seed value will do.
- * for hash-like algorithms, a state needs to be stored
- */
-
-struct bch2_checksum_state {
- union {
- u64 seed;
- struct xxh64_state h64state;
- };
- unsigned int type;
-};
-
-static void bch2_checksum_init(struct bch2_checksum_state *state)
-{
- switch (state->type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_crc64:
- state->seed = 0;
- break;
- case BCH_CSUM_crc32c_nonzero:
- state->seed = U32_MAX;
- break;
- case BCH_CSUM_crc64_nonzero:
- state->seed = U64_MAX;
- break;
- case BCH_CSUM_xxhash:
- xxh64_reset(&state->h64state, 0);
- break;
- default:
- BUG();
- }
-}
-
-static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
-{
- switch (state->type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_crc64:
- return state->seed;
- case BCH_CSUM_crc32c_nonzero:
- return state->seed ^ U32_MAX;
- case BCH_CSUM_crc64_nonzero:
- return state->seed ^ U64_MAX;
- case BCH_CSUM_xxhash:
- return xxh64_digest(&state->h64state);
- default:
- BUG();
- }
-}
-
-static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
-{
- switch (state->type) {
- case BCH_CSUM_none:
- return;
- case BCH_CSUM_crc32c_nonzero:
- case BCH_CSUM_crc32c:
- state->seed = crc32c(state->seed, data, len);
- break;
- case BCH_CSUM_crc64_nonzero:
- case BCH_CSUM_crc64:
- state->seed = crc64_be(state->seed, data, len);
- break;
- case BCH_CSUM_xxhash:
- xxh64_update(&state->h64state, data, len);
- break;
- default:
- BUG();
- }
-}
-
-static void bch2_chacha20_init(struct chacha_state *state,
- const struct bch_key *key, struct nonce nonce)
-{
- u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)];
-
- BUILD_BUG_ON(sizeof(key_words) != sizeof(*key));
- memcpy(key_words, key, sizeof(key_words));
- le32_to_cpu_array(key_words, ARRAY_SIZE(key_words));
-
- BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE);
- chacha_init(state, key_words, (const u8 *)nonce.d);
-
- memzero_explicit(key_words, sizeof(key_words));
-}
-
-void bch2_chacha20(const struct bch_key *key, struct nonce nonce,
- void *data, size_t len)
-{
- struct chacha_state state;
-
- bch2_chacha20_init(&state, key, nonce);
- chacha20_crypt(&state, data, data, len);
- chacha_zeroize_state(&state);
-}
-
-static void bch2_poly1305_init(struct poly1305_desc_ctx *desc,
- struct bch_fs *c, struct nonce nonce)
-{
- u8 key[POLY1305_KEY_SIZE] = { 0 };
-
- nonce.d[3] ^= BCH_NONCE_POLY;
-
- bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key));
- poly1305_init(desc, key);
-}
-
-struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
- struct nonce nonce, const void *data, size_t len)
-{
- switch (type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c_nonzero:
- case BCH_CSUM_crc64_nonzero:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_xxhash:
- case BCH_CSUM_crc64: {
- struct bch2_checksum_state state;
-
- state.type = type;
-
- bch2_checksum_init(&state);
- bch2_checksum_update(&state, data, len);
-
- return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
- }
-
- case BCH_CSUM_chacha20_poly1305_80:
- case BCH_CSUM_chacha20_poly1305_128: {
- struct poly1305_desc_ctx dctx;
- u8 digest[POLY1305_DIGEST_SIZE];
- struct bch_csum ret = { 0 };
-
- bch2_poly1305_init(&dctx, c, nonce);
- poly1305_update(&dctx, data, len);
- poly1305_final(&dctx, digest);
-
- memcpy(&ret, digest, bch_crc_bytes[type]);
- return ret;
- }
- default:
- return (struct bch_csum) {};
- }
-}
-
-int bch2_encrypt(struct bch_fs *c, unsigned type,
- struct nonce nonce, void *data, size_t len)
-{
- if (!bch2_csum_type_is_encryption(type))
- return 0;
-
- if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
- c, "attempting to encrypt without encryption key"))
- return bch_err_throw(c, no_encryption_key);
-
- bch2_chacha20(&c->chacha20_key, nonce, data, len);
- return 0;
-}
-
-static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv;
-
- switch (type) {
- case BCH_CSUM_none:
- return (struct bch_csum) { 0 };
- case BCH_CSUM_crc32c_nonzero:
- case BCH_CSUM_crc64_nonzero:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_xxhash:
- case BCH_CSUM_crc64: {
- struct bch2_checksum_state state;
-
- state.type = type;
- bch2_checksum_init(&state);
-
-#ifdef CONFIG_HIGHMEM
- __bio_for_each_segment(bv, bio, *iter, *iter) {
- void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
-
- bch2_checksum_update(&state, p, bv.bv_len);
- kunmap_local(p);
- }
-#else
- __bio_for_each_bvec(bv, bio, *iter, *iter)
- bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
- bv.bv_len);
-#endif
- return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
- }
-
- case BCH_CSUM_chacha20_poly1305_80:
- case BCH_CSUM_chacha20_poly1305_128: {
- struct poly1305_desc_ctx dctx;
- u8 digest[POLY1305_DIGEST_SIZE];
- struct bch_csum ret = { 0 };
-
- bch2_poly1305_init(&dctx, c, nonce);
-
-#ifdef CONFIG_HIGHMEM
- __bio_for_each_segment(bv, bio, *iter, *iter) {
- void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
-
- poly1305_update(&dctx, p, bv.bv_len);
- kunmap_local(p);
- }
-#else
- __bio_for_each_bvec(bv, bio, *iter, *iter)
- poly1305_update(&dctx,
- page_address(bv.bv_page) + bv.bv_offset,
- bv.bv_len);
-#endif
- poly1305_final(&dctx, digest);
-
- memcpy(&ret, digest, bch_crc_bytes[type]);
- return ret;
- }
- default:
- return (struct bch_csum) {};
- }
-}
-
-struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- struct bvec_iter iter = bio->bi_iter;
-
- return __bch2_checksum_bio(c, type, nonce, bio, &iter);
-}
-
-int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
- struct chacha_state chacha_state;
- int ret = 0;
-
- if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
- c, "attempting to encrypt without encryption key"))
- return bch_err_throw(c, no_encryption_key);
-
- bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce);
-
- bio_for_each_segment(bv, bio, iter) {
- void *p;
-
- /*
- * chacha_crypt() assumes that the length is a multiple of
- * CHACHA_BLOCK_SIZE on any non-final call.
- */
- if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) {
- bch_err_ratelimited(c, "bio not aligned for encryption");
- ret = -EIO;
- break;
- }
-
- p = bvec_kmap_local(&bv);
- chacha20_crypt(&chacha_state, p, p, bv.bv_len);
- kunmap_local(p);
- }
- chacha_zeroize_state(&chacha_state);
- return ret;
-}
-
-struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
- struct bch_csum b, size_t b_len)
-{
- struct bch2_checksum_state state;
-
- state.type = type;
- bch2_checksum_init(&state);
- state.seed = le64_to_cpu(a.lo);
-
- BUG_ON(!bch2_checksum_mergeable(type));
-
- while (b_len) {
- unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
-
- bch2_checksum_update(&state,
- page_address(ZERO_PAGE(0)), page_len);
- b_len -= page_len;
- }
- a.lo = cpu_to_le64(bch2_checksum_final(&state));
- a.lo ^= b.lo;
- a.hi ^= b.hi;
- return a;
-}
-
-int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
- struct bversion version,
- struct bch_extent_crc_unpacked crc_old,
- struct bch_extent_crc_unpacked *crc_a,
- struct bch_extent_crc_unpacked *crc_b,
- unsigned len_a, unsigned len_b,
- unsigned new_csum_type)
-{
- struct bvec_iter iter = bio->bi_iter;
- struct nonce nonce = extent_nonce(version, crc_old);
- struct bch_csum merged = { 0 };
- struct crc_split {
- struct bch_extent_crc_unpacked *crc;
- unsigned len;
- unsigned csum_type;
- struct bch_csum csum;
- } splits[3] = {
- { crc_a, len_a, new_csum_type, { 0 }},
- { crc_b, len_b, new_csum_type, { 0 } },
- { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
- }, *i;
- bool mergeable = crc_old.csum_type == new_csum_type &&
- bch2_checksum_mergeable(new_csum_type);
- unsigned crc_nonce = crc_old.nonce;
-
- BUG_ON(len_a + len_b > bio_sectors(bio));
- BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
- BUG_ON(crc_is_compressed(crc_old));
- BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
- bch2_csum_type_is_encryption(new_csum_type));
-
- for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
- iter.bi_size = i->len << 9;
- if (mergeable || i->crc)
- i->csum = __bch2_checksum_bio(c, i->csum_type,
- nonce, bio, &iter);
- else
- bio_advance_iter(bio, &iter, i->len << 9);
- nonce = nonce_add(nonce, i->len << 9);
- }
-
- if (mergeable)
- for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
- merged = bch2_checksum_merge(new_csum_type, merged,
- i->csum, i->len << 9);
- else
- merged = bch2_checksum_bio(c, crc_old.csum_type,
- extent_nonce(version, crc_old), bio);
-
- if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
- " expected %0llx:%0llx got %0llx:%0llx (old type ",
- __func__,
- crc_old.csum.hi,
- crc_old.csum.lo,
- merged.hi,
- merged.lo);
- bch2_prt_csum_type(&buf, crc_old.csum_type);
- prt_str(&buf, " new type ");
- bch2_prt_csum_type(&buf, new_csum_type);
- prt_str(&buf, ")");
- WARN_RATELIMIT(1, "%s", buf.buf);
- printbuf_exit(&buf);
- return bch_err_throw(c, recompute_checksum);
- }
-
- for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
- if (i->crc)
- *i->crc = (struct bch_extent_crc_unpacked) {
- .csum_type = i->csum_type,
- .compression_type = crc_old.compression_type,
- .compressed_size = i->len,
- .uncompressed_size = i->len,
- .offset = 0,
- .live_size = i->len,
- .nonce = crc_nonce,
- .csum = i->csum,
- };
-
- if (bch2_csum_type_is_encryption(new_csum_type))
- crc_nonce += i->len;
- }
-
- return 0;
-}
-
-/* BCH_SB_FIELD_crypt: */
-
-static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
- if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&crypt->field), sizeof(*crypt));
- return -BCH_ERR_invalid_sb_crypt;
- }
-
- if (BCH_CRYPT_KDF_TYPE(crypt)) {
- prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
- return -BCH_ERR_invalid_sb_crypt;
- }
-
- return 0;
-}
-
-static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-	prt_printf(out, "KDF: %llu\n", BCH_CRYPT_KDF_TYPE(crypt));
- prt_printf(out, "scrypt n: %llu\n", BCH_KDF_SCRYPT_N(crypt));
- prt_printf(out, "scrypt r: %llu\n", BCH_KDF_SCRYPT_R(crypt));
- prt_printf(out, "scrypt p: %llu\n", BCH_KDF_SCRYPT_P(crypt));
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
- .validate = bch2_sb_crypt_validate,
- .to_text = bch2_sb_crypt_to_text,
-};
-
-#ifdef __KERNEL__
-static int __bch2_request_key(char *key_description, struct bch_key *key)
-{
- struct key *keyring_key;
- const struct user_key_payload *ukp;
- int ret;
-
- keyring_key = request_key(&key_type_user, key_description, NULL);
- if (IS_ERR(keyring_key))
- return PTR_ERR(keyring_key);
-
- down_read(&keyring_key->sem);
- ukp = dereference_key_locked(keyring_key);
- if (ukp->datalen == sizeof(*key)) {
- memcpy(key, ukp->data, ukp->datalen);
- ret = 0;
- } else {
- ret = -EINVAL;
- }
- up_read(&keyring_key->sem);
- key_put(keyring_key);
-
- return ret;
-}
-#else
-#include <keyutils.h>
-
-static int __bch2_request_key(char *key_description, struct bch_key *key)
-{
- key_serial_t key_id;
-
- key_id = request_key("user", key_description, NULL,
- KEY_SPEC_SESSION_KEYRING);
- if (key_id >= 0)
- goto got_key;
-
- key_id = request_key("user", key_description, NULL,
- KEY_SPEC_USER_KEYRING);
- if (key_id >= 0)
- goto got_key;
-
- key_id = request_key("user", key_description, NULL,
- KEY_SPEC_USER_SESSION_KEYRING);
- if (key_id >= 0)
- goto got_key;
-
- return -errno;
-got_key:
-
- if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
- return -1;
-
- return 0;
-}
-
-#include "crypto.h"
-#endif
-
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
-{
- struct printbuf key_description = PRINTBUF;
- int ret;
-
- prt_printf(&key_description, "bcachefs:");
- pr_uuid(&key_description, sb->user_uuid.b);
-
- ret = __bch2_request_key(key_description.buf, key);
- printbuf_exit(&key_description);
-
-#ifndef __KERNEL__
- if (ret) {
- char *passphrase = read_passphrase("Enter passphrase: ");
- struct bch_encrypted_key sb_key;
-
- bch2_passphrase_check(sb, passphrase,
- key, &sb_key);
- ret = 0;
- }
-#endif
-
- /* stash with memfd, pass memfd fd to mount */
-
- return ret;
-}
-
-#ifndef __KERNEL__
-int bch2_revoke_key(struct bch_sb *sb)
-{
- key_serial_t key_id;
- struct printbuf key_description = PRINTBUF;
-
- prt_printf(&key_description, "bcachefs:");
- pr_uuid(&key_description, sb->user_uuid.b);
-
- key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
- printbuf_exit(&key_description);
- if (key_id < 0)
- return errno;
-
- keyctl_revoke(key_id);
-
- return 0;
-}
-#endif
-
-int bch2_decrypt_sb_key(struct bch_fs *c,
- struct bch_sb_field_crypt *crypt,
- struct bch_key *key)
-{
- struct bch_encrypted_key sb_key = crypt->key;
- struct bch_key user_key;
- int ret = 0;
-
- /* is key encrypted? */
- if (!bch2_key_is_encrypted(&sb_key))
- goto out;
-
- ret = bch2_request_key(c->disk_sb.sb, &user_key);
- if (ret) {
- bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
- goto err;
- }
-
- /* decrypt real key: */
- bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key));
-
- if (bch2_key_is_encrypted(&sb_key)) {
- bch_err(c, "incorrect encryption key");
- ret = -EINVAL;
- goto err;
- }
-out:
- *key = sb_key.key;
-err:
- memzero_explicit(&sb_key, sizeof(sb_key));
- memzero_explicit(&user_key, sizeof(user_key));
- return ret;
-}
-
-#if 0
-
-/*
- * This seems to be duplicating code in cmd_remove_passphrase() in
- * bcachefs-tools, but we might want to switch userspace to use this - and
- * perhaps add an ioctl for calling this at runtime, so we can take the
- * passphrase off of a mounted filesystem (which has come up).
- */
-int bch2_disable_encryption(struct bch_fs *c)
-{
- struct bch_sb_field_crypt *crypt;
- struct bch_key key;
- int ret = -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
- if (!crypt)
- goto out;
-
- /* is key encrypted? */
- ret = 0;
- if (bch2_key_is_encrypted(&crypt->key))
- goto out;
-
- ret = bch2_decrypt_sb_key(c, crypt, &key);
- if (ret)
- goto out;
-
- crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC);
- crypt->key.key = key;
-
- SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
- bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-/*
- * For enabling encryption on an existing filesystem: not hooked up yet, but it
- * should be
- */
-int bch2_enable_encryption(struct bch_fs *c, bool keyed)
-{
- struct bch_encrypted_key key;
- struct bch_key user_key;
- struct bch_sb_field_crypt *crypt;
- int ret = -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- /* Do we already have an encryption key? */
- if (bch2_sb_field_get(c->disk_sb.sb, crypt))
- goto err;
-
- ret = bch2_alloc_ciphers(c);
- if (ret)
- goto err;
-
- key.magic = cpu_to_le64(BCH_KEY_MAGIC);
- get_random_bytes(&key.key, sizeof(key.key));
-
- if (keyed) {
- ret = bch2_request_key(c->disk_sb.sb, &user_key);
- if (ret) {
- bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
- goto err;
- }
-
- ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
- &key, sizeof(key));
- if (ret)
- goto err;
- }
-
- ret = crypto_skcipher_setkey(&c->chacha20->base,
- (void *) &key.key, sizeof(key.key));
- if (ret)
- goto err;
-
- crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
- sizeof(*crypt) / sizeof(u64));
- if (!crypt) {
- ret = bch_err_throw(c, ENOSPC_sb_crypt);
- goto err;
- }
-
- crypt->key = key;
-
- /* write superblock */
- SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
- bch2_write_super(c);
-err:
- mutex_unlock(&c->sb_lock);
- memzero_explicit(&user_key, sizeof(user_key));
- memzero_explicit(&key, sizeof(key));
- return ret;
-}
-#endif
-
-void bch2_fs_encryption_exit(struct bch_fs *c)
-{
- memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key));
-}
-
-int bch2_fs_encryption_init(struct bch_fs *c)
-{
- struct bch_sb_field_crypt *crypt;
- int ret;
-
- crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
- if (!crypt)
- return 0;
-
- ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key);
- if (ret)
- return ret;
- c->chacha20_key_set = true;
- return 0;
-}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
deleted file mode 100644
index 7bd9cf6104ca..000000000000
--- a/fs/bcachefs/checksum.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CHECKSUM_H
-#define _BCACHEFS_CHECKSUM_H
-
-#include "bcachefs.h"
-#include "extents_types.h"
-#include "super-io.h"
-
-#include <linux/crc64.h>
-#include <crypto/chacha.h>
-
-static inline bool bch2_checksum_mergeable(unsigned type)
-{
-
- switch (type) {
- case BCH_CSUM_none:
- case BCH_CSUM_crc32c:
- case BCH_CSUM_crc64:
- return true;
- default:
- return false;
- }
-}
-
-struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
- struct bch_csum, size_t);
-
-#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
-#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
-#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
-#define BCH_NONCE_PRIO cpu_to_le32(4 << 28)
-#define BCH_NONCE_POLY cpu_to_le32(1 << 31)
-
-struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
- const void *, size_t);
-
-/*
- * This is used for various on disk data structures - bch_sb, prio_set, bset,
- * jset: The checksum is _always_ the first field of these structs
- */
-#define csum_vstruct(_c, _type, _nonce, _i) \
-({ \
- const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
- \
- bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
-})
-
-static inline void bch2_csum_to_text(struct printbuf *out,
- enum bch_csum_type type,
- struct bch_csum csum)
-{
- const u8 *p = (u8 *) &csum;
- unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16;
-
- for (unsigned i = 0; i < bytes; i++)
- prt_hex_byte(out, p[i]);
-}
-
-static inline void bch2_csum_err_msg(struct printbuf *out,
- enum bch_csum_type type,
- struct bch_csum expected,
- struct bch_csum got)
-{
- prt_str(out, "checksum error, type ");
- bch2_prt_csum_type(out, type);
- prt_str(out, ": got ");
- bch2_csum_to_text(out, type, got);
- prt_str(out, " should be ");
- bch2_csum_to_text(out, type, expected);
-}
-
-void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t);
-
-int bch2_request_key(struct bch_sb *, struct bch_key *);
-#ifndef __KERNEL__
-int bch2_revoke_key(struct bch_sb *);
-#endif
-
-int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
- void *data, size_t);
-
-struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
-
-int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
- struct bch_extent_crc_unpacked,
- struct bch_extent_crc_unpacked *,
- struct bch_extent_crc_unpacked *,
- unsigned, unsigned, unsigned);
-
-int __bch2_encrypt_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
-
-static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- return bch2_csum_type_is_encryption(type)
- ? __bch2_encrypt_bio(c, type, nonce, bio)
- : 0;
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
-
-int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
- struct bch_key *);
-
-#if 0
-int bch2_disable_encryption(struct bch_fs *);
-int bch2_enable_encryption(struct bch_fs *, bool);
-#endif
-
-void bch2_fs_encryption_exit(struct bch_fs *);
-int bch2_fs_encryption_init(struct bch_fs *);
-
-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type,
- bool data)
-{
- switch (type) {
- case BCH_CSUM_OPT_none:
- return BCH_CSUM_none;
- case BCH_CSUM_OPT_crc32c:
- return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
- case BCH_CSUM_OPT_crc64:
- return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
- case BCH_CSUM_OPT_xxhash:
- return BCH_CSUM_xxhash;
- default:
- BUG();
- }
-}
-
-static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
- struct bch_io_opts opts)
-{
- if (opts.nocow)
- return 0;
-
- if (c->sb.encryption_type)
- return c->opts.wide_macs
- ? BCH_CSUM_chacha20_poly1305_128
- : BCH_CSUM_chacha20_poly1305_80;
-
- return bch2_csum_opt_to_type(opts.data_checksum, true);
-}
-
-static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
-{
- if (c->sb.encryption_type)
- return BCH_CSUM_chacha20_poly1305_128;
-
- return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
-}
-
-static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
- unsigned type)
-{
- if (type >= BCH_CSUM_NR)
- return false;
-
- if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set)
- return false;
-
- return true;
-}
-
-/* returns true if not equal */
-static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
-{
- /*
- * XXX: need some way of preventing the compiler from optimizing this
- * into a form that isn't constant time..
- */
- return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
-}
-
-/* for skipping ahead and encrypting/decrypting at an offset: */
-static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
-{
- EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
-
- le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
- return nonce;
-}
-
-static inline struct nonce null_nonce(void)
-{
- struct nonce ret;
-
- memset(&ret, 0, sizeof(ret));
- return ret;
-}
-
-static inline struct nonce extent_nonce(struct bversion version,
- struct bch_extent_crc_unpacked crc)
-{
- unsigned compression_type = crc_is_compressed(crc)
- ? crc.compression_type
- : 0;
- unsigned size = compression_type ? crc.uncompressed_size : 0;
- struct nonce nonce = (struct nonce) {{
- [0] = cpu_to_le32(size << 22),
- [1] = cpu_to_le32(version.lo),
- [2] = cpu_to_le32(version.lo >> 32),
- [3] = cpu_to_le32(version.hi|
- (compression_type << 24))^BCH_NONCE_EXTENT,
- }};
-
- return nonce_add(nonce, crc.nonce << 9);
-}
-
-static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
-{
- return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
-}
-
-static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
-{
- __le64 magic = __bch2_sb_magic(sb);
-
- return (struct nonce) {{
- [0] = 0,
- [1] = 0,
- [2] = ((__le32 *) &magic)[0],
- [3] = ((__le32 *) &magic)[1],
- }};
-}
-
-static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
-{
- __le64 magic = bch2_sb_magic(c);
-
- return (struct nonce) {{
- [0] = 0,
- [1] = 0,
- [2] = ((__le32 *) &magic)[0],
- [3] = ((__le32 *) &magic)[1],
- }};
-}
-
-#endif /* _BCACHEFS_CHECKSUM_H */
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
deleted file mode 100644
index 8e9264b5a84e..000000000000
--- a/fs/bcachefs/clock.c
+++ /dev/null
@@ -1,181 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "clock.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/preempt.h>
-
-static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args)
-{
- struct io_timer **_l = (struct io_timer **)l;
- struct io_timer **_r = (struct io_timer **)r;
-
- return (*_l)->expire < (*_r)->expire;
-}
-
-static const struct min_heap_callbacks callbacks = {
- .less = io_timer_cmp,
- .swp = NULL,
-};
-
-void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
-{
- spin_lock(&clock->timer_lock);
-
- if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) {
- spin_unlock(&clock->timer_lock);
- timer->fn(timer);
- return;
- }
-
- for (size_t i = 0; i < clock->timers.nr; i++)
- if (clock->timers.data[i] == timer)
- goto out;
-
- BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL));
-out:
- spin_unlock(&clock->timer_lock);
-}
-
-void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
-{
- spin_lock(&clock->timer_lock);
-
- for (size_t i = 0; i < clock->timers.nr; i++)
- if (clock->timers.data[i] == timer) {
- min_heap_del(&clock->timers, i, &callbacks, NULL);
- break;
- }
-
- spin_unlock(&clock->timer_lock);
-}
-
-struct io_clock_wait {
- struct io_timer io_timer;
- struct task_struct *task;
- int expired;
-};
-
-static void io_clock_wait_fn(struct io_timer *timer)
-{
- struct io_clock_wait *wait = container_of(timer,
- struct io_clock_wait, io_timer);
-
- wait->expired = 1;
- wake_up_process(wait->task);
-}
-
-void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
-{
- struct io_clock_wait wait = {
- .io_timer.expire = until,
- .io_timer.fn = io_clock_wait_fn,
- .io_timer.fn2 = (void *) _RET_IP_,
- .task = current,
- };
-
- bch2_io_timer_add(clock, &wait.io_timer);
- schedule();
- bch2_io_timer_del(clock, &wait.io_timer);
-}
-
-unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock,
- u64 io_until, unsigned long cpu_timeout)
-{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct io_clock_wait wait = {
- .io_timer.expire = io_until,
- .io_timer.fn = io_clock_wait_fn,
- .io_timer.fn2 = (void *) _RET_IP_,
- .task = current,
- };
-
- bch2_io_timer_add(clock, &wait.io_timer);
-
- set_current_state(TASK_INTERRUPTIBLE);
- if (!(kthread && kthread_should_stop())) {
- cpu_timeout = schedule_timeout(cpu_timeout);
- try_to_freeze();
- }
-
- __set_current_state(TASK_RUNNING);
- bch2_io_timer_del(clock, &wait.io_timer);
- return cpu_timeout;
-}
-
-void bch2_kthread_io_clock_wait(struct io_clock *clock,
- u64 io_until, unsigned long cpu_timeout)
-{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
-
- while (!(kthread && kthread_should_stop()) &&
- cpu_timeout &&
- atomic64_read(&clock->now) < io_until)
- cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout);
-}
-
-static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
-{
- struct io_timer *ret = NULL;
-
- if (clock->timers.nr &&
- time_after_eq64(now, clock->timers.data[0]->expire)) {
- ret = *min_heap_peek(&clock->timers);
- min_heap_pop(&clock->timers, &callbacks, NULL);
- }
-
- return ret;
-}
-
-void __bch2_increment_clock(struct io_clock *clock, u64 sectors)
-{
- struct io_timer *timer;
- u64 now = atomic64_add_return(sectors, &clock->now);
-
- spin_lock(&clock->timer_lock);
- while ((timer = get_expired_timer(clock, now)))
- timer->fn(timer);
- spin_unlock(&clock->timer_lock);
-}
-
-void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
-{
- out->atomic++;
- spin_lock(&clock->timer_lock);
- u64 now = atomic64_read(&clock->now);
-
- printbuf_tabstop_push(out, 40);
- prt_printf(out, "current time:\t%llu\n", now);
-
- for (unsigned i = 0; i < clock->timers.nr; i++)
- prt_printf(out, "%ps %ps:\t%llu\n",
- clock->timers.data[i]->fn,
- clock->timers.data[i]->fn2,
- clock->timers.data[i]->expire);
- spin_unlock(&clock->timer_lock);
- --out->atomic;
-}
-
-void bch2_io_clock_exit(struct io_clock *clock)
-{
- free_heap(&clock->timers);
- free_percpu(clock->pcpu_buf);
-}
-
-int bch2_io_clock_init(struct io_clock *clock)
-{
- atomic64_set(&clock->now, 0);
- spin_lock_init(&clock->timer_lock);
-
- clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
-
- clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
- if (!clock->pcpu_buf)
- return -BCH_ERR_ENOMEM_io_clock_init;
-
- if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
- return -BCH_ERR_ENOMEM_io_clock_init;
-
- return 0;
-}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
deleted file mode 100644
index 8769be2aa21e..000000000000
--- a/fs/bcachefs/clock.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CLOCK_H
-#define _BCACHEFS_CLOCK_H
-
-void bch2_io_timer_add(struct io_clock *, struct io_timer *);
-void bch2_io_timer_del(struct io_clock *, struct io_timer *);
-unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long);
-void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
-
-void __bch2_increment_clock(struct io_clock *, u64);
-
-static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
- int rw)
-{
- struct io_clock *clock = &c->io_clock[rw];
-
- if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
- IO_CLOCK_PCPU_SECTORS))
- __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
-}
-
-void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
-
-void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
-
-void bch2_io_clock_exit(struct io_clock *);
-int bch2_io_clock_init(struct io_clock *);
-
-#endif /* _BCACHEFS_CLOCK_H */
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
deleted file mode 100644
index 37554e4514fe..000000000000
--- a/fs/bcachefs/clock_types.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_CLOCK_TYPES_H
-#define _BCACHEFS_CLOCK_TYPES_H
-
-#include "util.h"
-
-#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
-
-/*
- * Clocks/timers in units of sectors of IO:
- *
- * Note - they use percpu batching, so they're only approximate.
- */
-
-struct io_timer;
-typedef void (*io_timer_fn)(struct io_timer *);
-
-struct io_timer {
- io_timer_fn fn;
- void *fn2;
- u64 expire;
-};
-
-/* Amount to buffer up on a percpu counter */
-#define IO_CLOCK_PCPU_SECTORS 128
-
-typedef DEFINE_MIN_HEAP(struct io_timer *, io_timer_heap) io_timer_heap;
-
-struct io_clock {
- atomic64_t now;
- u16 __percpu *pcpu_buf;
- unsigned max_slop;
-
- spinlock_t timer_lock;
- io_timer_heap timers;
-};
-
-#endif /* _BCACHEFS_CLOCK_TYPES_H */
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
deleted file mode 100644
index b37b1f325f0a..000000000000
--- a/fs/bcachefs/compress.c
+++ /dev/null
@@ -1,773 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "checksum.h"
-#include "compress.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "opts.h"
-#include "super-io.h"
-
-#include <linux/lz4.h>
-#include <linux/zlib.h>
-#include <linux/zstd.h>
-
-static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type)
-{
- switch (type) {
- case BCH_COMPRESSION_TYPE_none:
- case BCH_COMPRESSION_TYPE_incompressible:
- return BCH_COMPRESSION_OPT_none;
- case BCH_COMPRESSION_TYPE_lz4_old:
- case BCH_COMPRESSION_TYPE_lz4:
- return BCH_COMPRESSION_OPT_lz4;
- case BCH_COMPRESSION_TYPE_gzip:
- return BCH_COMPRESSION_OPT_gzip;
- case BCH_COMPRESSION_TYPE_zstd:
- return BCH_COMPRESSION_OPT_zstd;
- default:
- BUG();
- }
-}
-
-/* Bounce buffer: */
-struct bbuf {
- void *b;
- enum {
- BB_NONE,
- BB_VMAP,
- BB_KMALLOC,
- BB_MEMPOOL,
- } type;
- int rw;
-};
-
-static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
-{
- void *b;
-
- BUG_ON(size > c->opts.encoded_extent_max);
-
- b = kmalloc(size, GFP_NOFS|__GFP_NOWARN);
- if (b)
- return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
-
- b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS);
- if (b)
- return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
-
- BUG();
-}
-
-static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
- void *expected_start = NULL;
-
- __bio_for_each_bvec(bv, bio, iter, start) {
- if (expected_start &&
- expected_start != page_address(bv.bv_page) + bv.bv_offset)
- return false;
-
- expected_start = page_address(bv.bv_page) +
- bv.bv_offset + bv.bv_len;
- }
-
- return true;
-}
-
-static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
- struct bvec_iter start, int rw)
-{
- struct bbuf ret;
- struct bio_vec bv;
- struct bvec_iter iter;
- unsigned nr_pages = 0;
- struct page *stack_pages[16];
- struct page **pages = NULL;
- void *data;
-
- BUG_ON(start.bi_size > c->opts.encoded_extent_max);
-
- if (!PageHighMem(bio_iter_page(bio, start)) &&
- bio_phys_contig(bio, start))
- return (struct bbuf) {
- .b = page_address(bio_iter_page(bio, start)) +
- bio_iter_offset(bio, start),
- .type = BB_NONE, .rw = rw
- };
-
- /* check if we can map the pages contiguously: */
- __bio_for_each_segment(bv, bio, iter, start) {
- if (iter.bi_size != start.bi_size &&
- bv.bv_offset)
- goto bounce;
-
- if (bv.bv_len < iter.bi_size &&
- bv.bv_offset + bv.bv_len < PAGE_SIZE)
- goto bounce;
-
- nr_pages++;
- }
-
- BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
-
- pages = nr_pages > ARRAY_SIZE(stack_pages)
- ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
- : stack_pages;
- if (!pages)
- goto bounce;
-
- nr_pages = 0;
- __bio_for_each_segment(bv, bio, iter, start)
- pages[nr_pages++] = bv.bv_page;
-
- data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (pages != stack_pages)
- kfree(pages);
-
- if (data)
- return (struct bbuf) {
- .b = data + bio_iter_offset(bio, start),
- .type = BB_VMAP, .rw = rw
- };
-bounce:
- ret = __bounce_alloc(c, start.bi_size, rw);
-
- if (rw == READ)
- memcpy_from_bio(ret.b, bio, start);
-
- return ret;
-}
-
-static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
-{
- return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
-}
-
-static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
-{
- switch (buf.type) {
- case BB_NONE:
- break;
- case BB_VMAP:
- vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
- break;
- case BB_KMALLOC:
- kfree(buf.b);
- break;
- case BB_MEMPOOL:
- mempool_free(buf.b, &c->compression_bounce[buf.rw]);
- break;
- }
-}
-
-static inline void zlib_set_workspace(z_stream *strm, void *workspace)
-{
-#ifdef __KERNEL__
- strm->workspace = workspace;
-#endif
-}
-
-static int __bio_uncompress(struct bch_fs *c, struct bio *src,
- void *dst_data, struct bch_extent_crc_unpacked crc)
-{
- struct bbuf src_data = { NULL };
- size_t src_len = src->bi_iter.bi_size;
- size_t dst_len = crc.uncompressed_size << 9;
- void *workspace;
- int ret = 0, ret2;
-
- enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
- mempool_t *workspace_pool = &c->compress_workspace[opt];
- if (unlikely(!mempool_initialized(workspace_pool))) {
- if (fsck_err(c, compression_type_not_marked_in_sb,
- "compression type %s set but not marked in superblock",
- __bch2_compression_types[crc.compression_type]))
- ret = bch2_check_set_has_compressed_data(c, opt);
- else
- ret = bch_err_throw(c, compression_workspace_not_initialized);
- if (ret)
- goto err;
- }
-
- src_data = bio_map_or_bounce(c, src, READ);
-
- switch (crc.compression_type) {
- case BCH_COMPRESSION_TYPE_lz4_old:
- case BCH_COMPRESSION_TYPE_lz4:
- ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
- src_len, dst_len, dst_len);
- if (ret2 != dst_len)
- ret = bch_err_throw(c, decompress_lz4);
- break;
- case BCH_COMPRESSION_TYPE_gzip: {
- z_stream strm = {
- .next_in = src_data.b,
- .avail_in = src_len,
- .next_out = dst_data,
- .avail_out = dst_len,
- };
-
- workspace = mempool_alloc(workspace_pool, GFP_NOFS);
-
- zlib_set_workspace(&strm, workspace);
- zlib_inflateInit2(&strm, -MAX_WBITS);
- ret2 = zlib_inflate(&strm, Z_FINISH);
-
- mempool_free(workspace, workspace_pool);
-
- if (ret2 != Z_STREAM_END)
- ret = bch_err_throw(c, decompress_gzip);
- break;
- }
- case BCH_COMPRESSION_TYPE_zstd: {
- ZSTD_DCtx *ctx;
- size_t real_src_len = le32_to_cpup(src_data.b);
-
- if (real_src_len > src_len - 4) {
- ret = bch_err_throw(c, decompress_zstd_src_len_bad);
- goto err;
- }
-
- workspace = mempool_alloc(workspace_pool, GFP_NOFS);
- ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
-
- ret2 = zstd_decompress_dctx(ctx,
- dst_data, dst_len,
- src_data.b + 4, real_src_len);
-
- mempool_free(workspace, workspace_pool);
-
- if (ret2 != dst_len)
- ret = bch_err_throw(c, decompress_zstd);
- break;
- }
- default:
- BUG();
- }
-err:
-fsck_err:
- bio_unmap_or_unbounce(c, src_data);
- return ret;
-}
-
-int bch2_bio_uncompress_inplace(struct bch_write_op *op,
- struct bio *bio)
-{
- struct bch_fs *c = op->c;
- struct bch_extent_crc_unpacked *crc = &op->crc;
- struct bbuf data = { NULL };
- size_t dst_len = crc->uncompressed_size << 9;
- int ret = 0;
-
- /* bio must own its pages: */
- BUG_ON(!bio->bi_vcnt);
- BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
-
- if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) {
- bch2_write_op_error(op, op->pos.offset,
- "extent too big to decompress (%u > %u)",
- crc->uncompressed_size << 9, c->opts.encoded_extent_max);
- return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
- }
-
- data = __bounce_alloc(c, dst_len, WRITE);
-
- ret = __bio_uncompress(c, bio, data.b, *crc);
-
- if (c->opts.no_data_io)
- ret = 0;
-
- if (ret) {
- bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret));
- goto err;
- }
-
- /*
- * XXX: don't have a good way to assert that the bio was allocated with
- * enough space, we depend on bch2_move_extent doing the right thing
- */
- bio->bi_iter.bi_size = crc->live_size << 9;
-
- memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
-
- crc->csum_type = 0;
- crc->compression_type = 0;
- crc->compressed_size = crc->live_size;
- crc->uncompressed_size = crc->live_size;
- crc->offset = 0;
- crc->csum = (struct bch_csum) { 0, 0 };
-err:
- bio_unmap_or_unbounce(c, data);
- return ret;
-}
-
-int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
- struct bio *dst, struct bvec_iter dst_iter,
- struct bch_extent_crc_unpacked crc)
-{
- struct bbuf dst_data = { NULL };
- size_t dst_len = crc.uncompressed_size << 9;
- int ret;
-
- if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
- crc.compressed_size << 9 > c->opts.encoded_extent_max)
- return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
-
- dst_data = dst_len == dst_iter.bi_size
- ? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
- : __bounce_alloc(c, dst_len, WRITE);
-
- ret = __bio_uncompress(c, src, dst_data.b, crc);
- if (ret)
- goto err;
-
- if (dst_data.type != BB_NONE &&
- dst_data.type != BB_VMAP)
- memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
-err:
- bio_unmap_or_unbounce(c, dst_data);
- return ret;
-}
-
-static int attempt_compress(struct bch_fs *c,
- void *workspace,
- void *dst, size_t dst_len,
- void *src, size_t src_len,
- struct bch_compression_opt compression)
-{
- enum bch_compression_type compression_type =
- __bch2_compression_opt_to_type[compression.type];
-
- switch (compression_type) {
- case BCH_COMPRESSION_TYPE_lz4:
- if (compression.level < LZ4HC_MIN_CLEVEL) {
- int len = src_len;
- int ret = LZ4_compress_destSize(
- src, dst,
- &len, dst_len,
- workspace);
- if (len < src_len)
- return -len;
-
- return ret;
- } else {
- int ret = LZ4_compress_HC(
- src, dst,
- src_len, dst_len,
- compression.level,
- workspace);
-
- return ret ?: -1;
- }
- case BCH_COMPRESSION_TYPE_gzip: {
- z_stream strm = {
- .next_in = src,
- .avail_in = src_len,
- .next_out = dst,
- .avail_out = dst_len,
- };
-
- zlib_set_workspace(&strm, workspace);
- if (zlib_deflateInit2(&strm,
- compression.level
- ? clamp_t(unsigned, compression.level,
- Z_BEST_SPEED, Z_BEST_COMPRESSION)
- : Z_DEFAULT_COMPRESSION,
- Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
- Z_DEFAULT_STRATEGY) != Z_OK)
- return 0;
-
- if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
- return 0;
-
- if (zlib_deflateEnd(&strm) != Z_OK)
- return 0;
-
- return strm.total_out;
- }
- case BCH_COMPRESSION_TYPE_zstd: {
- /*
- * rescale:
- * zstd max compression level is 22, our max level is 15
- */
- unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
- ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
- ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
-
- /*
- * ZSTD requires that when we decompress we pass in the exact
- * compressed size - rounding it up to the nearest sector
- * doesn't work, so we use the first 4 bytes of the buffer for
- * that.
- *
- * Additionally, the ZSTD code seems to have a bug where it will
- * write just past the end of the buffer - so subtract a fudge
- * factor (7 bytes) from the dst buffer size to account for
- * that.
- */
- size_t len = zstd_compress_cctx(ctx,
- dst + 4, dst_len - 4 - 7,
- src, src_len,
- &params);
- if (zstd_is_error(len))
- return 0;
-
- *((__le32 *) dst) = cpu_to_le32(len);
- return len + 4;
- }
- default:
- BUG();
- }
-}
-
-static unsigned __bio_compress(struct bch_fs *c,
- struct bio *dst, size_t *dst_len,
- struct bio *src, size_t *src_len,
- struct bch_compression_opt compression)
-{
- struct bbuf src_data = { NULL }, dst_data = { NULL };
- void *workspace;
- enum bch_compression_type compression_type =
- __bch2_compression_opt_to_type[compression.type];
- unsigned pad;
- int ret = 0;
-
- /* bch2_compression_decode catches unknown compression types: */
- BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR);
-
- mempool_t *workspace_pool = &c->compress_workspace[compression.type];
- if (unlikely(!mempool_initialized(workspace_pool))) {
- if (fsck_err(c, compression_opt_not_marked_in_sb,
- "compression opt %s set but not marked in superblock",
- bch2_compression_opts[compression.type])) {
- ret = bch2_check_set_has_compressed_data(c, compression.type);
- if (ret) /* memory allocation failure, don't compress */
- return 0;
- } else {
- return 0;
- }
- }
-
- /* If it's only one block, don't bother trying to compress: */
- if (src->bi_iter.bi_size <= c->opts.block_size)
- return BCH_COMPRESSION_TYPE_incompressible;
-
- dst_data = bio_map_or_bounce(c, dst, WRITE);
- src_data = bio_map_or_bounce(c, src, READ);
-
- workspace = mempool_alloc(workspace_pool, GFP_NOFS);
-
- *src_len = src->bi_iter.bi_size;
- *dst_len = dst->bi_iter.bi_size;
-
- /*
- * XXX: this algorithm sucks when the compression code doesn't tell us
- * how much would fit, like LZ4 does:
- */
- while (1) {
- if (*src_len <= block_bytes(c)) {
- ret = -1;
- break;
- }
-
- ret = attempt_compress(c, workspace,
- dst_data.b, *dst_len,
- src_data.b, *src_len,
- compression);
- if (ret > 0) {
- *dst_len = ret;
- ret = 0;
- break;
- }
-
- /* Didn't fit: should we retry with a smaller amount? */
- if (*src_len <= *dst_len) {
- ret = -1;
- break;
- }
-
- /*
- * If ret is negative, it's a hint as to how much data would fit
- */
- BUG_ON(-ret >= *src_len);
-
- if (ret < 0)
- *src_len = -ret;
- else
- *src_len -= (*src_len - *dst_len) / 2;
- *src_len = round_down(*src_len, block_bytes(c));
- }
-
- mempool_free(workspace, workspace_pool);
-
- if (ret)
- goto err;
-
- /* Didn't get smaller: */
- if (round_up(*dst_len, block_bytes(c)) >= *src_len)
- goto err;
-
- pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
-
- memset(dst_data.b + *dst_len, 0, pad);
- *dst_len += pad;
-
- if (dst_data.type != BB_NONE &&
- dst_data.type != BB_VMAP)
- memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
-
- BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
- BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
- BUG_ON(*dst_len & (block_bytes(c) - 1));
- BUG_ON(*src_len & (block_bytes(c) - 1));
- ret = compression_type;
-out:
- bio_unmap_or_unbounce(c, src_data);
- bio_unmap_or_unbounce(c, dst_data);
- return ret;
-err:
- ret = BCH_COMPRESSION_TYPE_incompressible;
- goto out;
-fsck_err:
- ret = 0;
- goto out;
-}
-
-unsigned bch2_bio_compress(struct bch_fs *c,
- struct bio *dst, size_t *dst_len,
- struct bio *src, size_t *src_len,
- unsigned compression_opt)
-{
- unsigned orig_dst = dst->bi_iter.bi_size;
- unsigned orig_src = src->bi_iter.bi_size;
- unsigned compression_type;
-
- /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
- src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
- c->opts.encoded_extent_max);
- /* Don't generate a bigger output than input: */
- dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
-
- compression_type =
- __bio_compress(c, dst, dst_len, src, src_len,
- bch2_compression_decode(compression_opt));
-
- dst->bi_iter.bi_size = orig_dst;
- src->bi_iter.bi_size = orig_src;
- return compression_type;
-}
-
-static int __bch2_fs_compress_init(struct bch_fs *, u64);
-
-#define BCH_FEATURE_none 0
-
-static const unsigned bch2_compression_opt_to_feature[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
- BCH_COMPRESSION_OPTS()
-#undef x
-};
-
-#undef BCH_FEATURE_none
-
-static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
-{
- int ret = 0;
-
- if ((c->sb.features & f) == f)
- return 0;
-
- mutex_lock(&c->sb_lock);
-
- if ((c->sb.features & f) == f) {
- mutex_unlock(&c->sb_lock);
- return 0;
- }
-
- ret = __bch2_fs_compress_init(c, c->sb.features|f);
- if (ret) {
- mutex_unlock(&c->sb_lock);
- return ret;
- }
-
- c->disk_sb.sb->features[0] |= cpu_to_le64(f);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-int bch2_check_set_has_compressed_data(struct bch_fs *c,
- unsigned compression_opt)
-{
- unsigned compression_type = bch2_compression_decode(compression_opt).type;
-
- BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
-
- return compression_type
- ? __bch2_check_set_has_compressed_data(c,
- 1ULL << bch2_compression_opt_to_feature[compression_type])
- : 0;
-}
-
-void bch2_fs_compress_exit(struct bch_fs *c)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
- mempool_exit(&c->compress_workspace[i]);
- mempool_exit(&c->compression_bounce[WRITE]);
- mempool_exit(&c->compression_bounce[READ]);
-}
-
-static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
-{
- ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
- c->opts.encoded_extent_max);
-
- c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
-
- struct {
- unsigned feature;
- enum bch_compression_opts type;
- size_t compress_workspace;
- } compression_types[] = {
- { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4,
- max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
- { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip,
- max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
- zlib_inflate_workspacesize()) },
- { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd,
- max(c->zstd_workspace_size,
- zstd_dctx_workspace_bound()) },
- }, *i;
- bool have_compressed = false;
-
- for (i = compression_types;
- i < compression_types + ARRAY_SIZE(compression_types);
- i++)
- have_compressed |= (features & (1 << i->feature)) != 0;
-
- if (!have_compressed)
- return 0;
-
- if (!mempool_initialized(&c->compression_bounce[READ]) &&
- mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
- 1, c->opts.encoded_extent_max))
- return bch_err_throw(c, ENOMEM_compression_bounce_read_init);
-
- if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
- mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
- 1, c->opts.encoded_extent_max))
- return bch_err_throw(c, ENOMEM_compression_bounce_write_init);
-
- for (i = compression_types;
- i < compression_types + ARRAY_SIZE(compression_types);
- i++) {
- if (!(features & (1 << i->feature)))
- continue;
-
- if (mempool_initialized(&c->compress_workspace[i->type]))
- continue;
-
- if (mempool_init_kvmalloc_pool(
- &c->compress_workspace[i->type],
- 1, i->compress_workspace))
- return bch_err_throw(c, ENOMEM_compression_workspace_init);
- }
-
- return 0;
-}
-
-static u64 compression_opt_to_feature(unsigned v)
-{
- unsigned type = bch2_compression_decode(v).type;
-
- return BIT_ULL(bch2_compression_opt_to_feature[type]);
-}
-
-int bch2_fs_compress_init(struct bch_fs *c)
-{
- u64 f = c->sb.features;
-
- f |= compression_opt_to_feature(c->opts.compression);
- f |= compression_opt_to_feature(c->opts.background_compression);
-
- return __bch2_fs_compress_init(c, f);
-}
-
-int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
- struct printbuf *err)
-{
- char *val = kstrdup(_val, GFP_KERNEL);
- char *p = val, *type_str, *level_str;
- struct bch_compression_opt opt = { 0 };
- int ret;
-
- if (!val)
- return -ENOMEM;
-
- type_str = strsep(&p, ":");
- level_str = p;
-
- ret = match_string(bch2_compression_opts, -1, type_str);
- if (ret < 0 && err)
- prt_printf(err, "invalid compression type\n");
- if (ret < 0)
- goto err;
-
- opt.type = ret;
-
- if (level_str) {
- unsigned level;
-
- ret = kstrtouint(level_str, 10, &level);
- if (!ret && !opt.type && level)
- ret = -EINVAL;
- if (!ret && level > 15)
- ret = -EINVAL;
- if (ret < 0 && err)
- prt_printf(err, "invalid compression level\n");
- if (ret < 0)
- goto err;
-
- opt.level = level;
- }
-
- *res = bch2_compression_encode(opt);
-err:
- kfree(val);
- return ret;
-}
-
-void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
-{
- struct bch_compression_opt opt = bch2_compression_decode(v);
-
- if (opt.type < BCH_COMPRESSION_OPT_NR)
- prt_str(out, bch2_compression_opts[opt.type]);
- else
- prt_printf(out, "(unknown compression opt %u)", opt.type);
- if (opt.level)
- prt_printf(out, ":%u", opt.level);
-}
-
-void bch2_opt_compression_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
-{
- return bch2_compression_opt_to_text(out, v);
-}
-
-int bch2_opt_compression_validate(u64 v, struct printbuf *err)
-{
- if (!bch2_compression_opt_valid(v)) {
- prt_printf(err, "invalid compression opt %llu", v);
- return -BCH_ERR_invalid_sb_opt_compression;
- }
-
- return 0;
-}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
deleted file mode 100644
index bec2f05bfd52..000000000000
--- a/fs/bcachefs/compress.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COMPRESS_H
-#define _BCACHEFS_COMPRESS_H
-
-#include "extents_types.h"
-
-static const unsigned __bch2_compression_opt_to_type[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
- BCH_COMPRESSION_OPTS()
-#undef x
-};
-
-struct bch_compression_opt {
- u8 type:4,
- level:4;
-};
-
-static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
-{
- return (struct bch_compression_opt) {
- .type = v & 15,
- .level = v >> 4,
- };
-}
-
-static inline bool bch2_compression_opt_valid(unsigned v)
-{
- struct bch_compression_opt opt = __bch2_compression_decode(v);
-
- return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
-}
-
-static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
-{
- return bch2_compression_opt_valid(v)
- ? __bch2_compression_decode(v)
- : (struct bch_compression_opt) { 0 };
-}
-
-static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
-{
- return opt.type|(opt.level << 4);
-}
-
-static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
-{
- return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
-}
-
-struct bch_write_op;
-int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *);
-int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
- struct bvec_iter, struct bch_extent_crc_unpacked);
-unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
- struct bio *, size_t *, unsigned);
-
-int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
-void bch2_fs_compress_exit(struct bch_fs *);
-int bch2_fs_compress_init(struct bch_fs *);
-
-void bch2_compression_opt_to_text(struct printbuf *, u64);
-
-int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
-void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-int bch2_opt_compression_validate(u64, struct printbuf *);
-
-#define bch2_opt_compression (struct bch_opt_fn) { \
- .parse = bch2_opt_compression_parse, \
- .to_text = bch2_opt_compression_to_text, \
- .validate = bch2_opt_compression_validate, \
-}
-
-#endif /* _BCACHEFS_COMPRESS_H */
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
deleted file mode 100644
index e86d36d23e9e..000000000000
--- a/fs/bcachefs/darray.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/log2.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include "darray.h"
-
-int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
-{
- if (new_size > d->size) {
- new_size = roundup_pow_of_two(new_size);
-
- /*
- * This is a workaround: kvmalloc() doesn't support > INT_MAX
- * allocations, but vmalloc() does.
- * The limit needs to be lifted from kvmalloc, and when it does
- * we'll go back to just using that.
- */
- size_t bytes;
- if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
- return -ENOMEM;
-
- void *data = likely(bytes < INT_MAX)
- ? kvmalloc_noprof(bytes, gfp)
- : vmalloc_noprof(bytes);
- if (!data)
- return -ENOMEM;
-
- if (d->size)
- memcpy(data, d->data, d->size * element_size);
- if (d->data != d->preallocated)
- kvfree(d->data);
- d->data = data;
- d->size = new_size;
- }
-
- return 0;
-}
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
deleted file mode 100644
index 4080ee99aadd..000000000000
--- a/fs/bcachefs/darray.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DARRAY_H
-#define _BCACHEFS_DARRAY_H
-
-/*
- * Dynamic arrays:
- *
- * Inspired by CCAN's darray
- */
-
-#include <linux/cleanup.h>
-#include <linux/slab.h>
-
-#define DARRAY_PREALLOCATED(_type, _nr) \
-struct { \
- size_t nr, size; \
- _type *data; \
- _type preallocated[_nr]; \
-}
-
-#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
-
-typedef DARRAY(char) darray_char;
-typedef DARRAY(char *) darray_str;
-typedef DARRAY(const char *) darray_const_str;
-
-typedef DARRAY(u8) darray_u8;
-typedef DARRAY(u16) darray_u16;
-typedef DARRAY(u32) darray_u32;
-typedef DARRAY(u64) darray_u64;
-
-typedef DARRAY(s8) darray_s8;
-typedef DARRAY(s16) darray_s16;
-typedef DARRAY(s32) darray_s32;
-typedef DARRAY(s64) darray_s64;
-
-int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
-
-#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
-
-#define __darray_resize(_d, _element_size, _new_size, _gfp) \
- (unlikely((_new_size) > (_d)->size) \
- ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
- : 0)
-
-#define darray_resize_gfp(_d, _new_size, _gfp) \
- __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
-
-#define darray_resize(_d, _new_size) \
- darray_resize_gfp(_d, _new_size, GFP_KERNEL)
-
-#define darray_make_room_gfp(_d, _more, _gfp) \
- darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
-
-#define darray_make_room(_d, _more) \
- darray_make_room_gfp(_d, _more, GFP_KERNEL)
-
-#define darray_room(_d) ((_d).size - (_d).nr)
-
-#define darray_top(_d) ((_d).data[(_d).nr])
-
-#define darray_push_gfp(_d, _item, _gfp) \
-({ \
- int _ret = darray_make_room_gfp((_d), 1, _gfp); \
- \
- if (!_ret) \
- (_d)->data[(_d)->nr++] = (_item); \
- _ret; \
-})
-
-#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL)
-
-#define darray_pop(_d) ((_d)->data[--(_d)->nr])
-
-#define darray_first(_d) ((_d).data[0])
-#define darray_last(_d) ((_d).data[(_d).nr - 1])
-
-#define darray_insert_item(_d, pos, _item) \
-({ \
- size_t _pos = (pos); \
- int _ret = darray_make_room((_d), 1); \
- \
- if (!_ret) \
- array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \
- _ret; \
-})
-
-#define darray_remove_item(_d, _pos) \
- array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
-
-#define darray_find_p(_d, _i, cond) \
-({ \
- typeof((_d).data) _ret = NULL; \
- \
- darray_for_each(_d, _i) \
- if (cond) { \
- _ret = _i; \
- break; \
- } \
- _ret; \
-})
-
-#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item)
-
-/* Iteration: */
-
-#define __darray_for_each(_d, _i) \
- for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
-
-#define darray_for_each(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
-
-#define darray_for_each_reverse(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
-
-/* Init/exit */
-
-#define darray_init(_d) \
-do { \
- (_d)->nr = 0; \
- (_d)->size = ARRAY_SIZE((_d)->preallocated); \
- (_d)->data = (_d)->size ? (_d)->preallocated : NULL; \
-} while (0)
-
-#define darray_exit(_d) \
-do { \
- if (!ARRAY_SIZE((_d)->preallocated) || \
- (_d)->data != (_d)->preallocated) \
- kvfree((_d)->data); \
- darray_init(_d); \
-} while (0)
-
-#define DEFINE_DARRAY_CLASS(_type) \
-DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void)
-
-#define DEFINE_DARRAY(_type) \
-typedef DARRAY(_type) darray_##_type; \
-DEFINE_DARRAY_CLASS(darray_##_type)
-
-#define DEFINE_DARRAY_NAMED(_name, _type) \
-typedef DARRAY(_type) _name; \
-DEFINE_DARRAY_CLASS(_name)
-
-DEFINE_DARRAY_CLASS(darray_char);
-DEFINE_DARRAY_CLASS(darray_str)
-DEFINE_DARRAY_CLASS(darray_const_str)
-
-DEFINE_DARRAY_CLASS(darray_u8)
-DEFINE_DARRAY_CLASS(darray_u16)
-DEFINE_DARRAY_CLASS(darray_u32)
-DEFINE_DARRAY_CLASS(darray_u64)
-
-DEFINE_DARRAY_CLASS(darray_s8)
-DEFINE_DARRAY_CLASS(darray_s16)
-DEFINE_DARRAY_CLASS(darray_s32)
-DEFINE_DARRAY_CLASS(darray_s64)
-
-#endif /* _BCACHEFS_DARRAY_H */
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
deleted file mode 100644
index e848e210a9bf..000000000000
--- a/fs/bcachefs/data_update.c
+++ /dev/null
@@ -1,1021 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "compress.h"
-#include "data_update.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "extents.h"
-#include "io_write.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-
-static const char * const bch2_data_update_type_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_DATA_UPDATE_TYPES()
-#undef x
- NULL
-};
-
-static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
-}
-
-static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr) {
- if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
- bkey_for_each_ptr(ptrs, ptr2) {
- if (ptr2 == ptr)
- break;
- bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
- }
- return false;
- }
- }
- return true;
-}
-
-static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
-}
-
-static noinline_for_stack
-bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
- const struct bch_extent_ptr *start)
-{
- if (!ctxt) {
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr == start)
- break;
-
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
- return false;
- }
-
- __bkey_for_each_ptr(start, ptrs.end, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- bool locked;
- move_ctxt_wait_event(ctxt,
- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
- list_empty(&ctxt->ios));
- if (!locked)
- bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
- }
- return true;
-}
-
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
-{
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
- return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
- }
-
- return true;
-}
-
-noinline_for_stack
-static void trace_io_move_finish2(struct data_update *u,
- struct bkey_i *new,
- struct bkey_i *insert)
-{
- struct bch_fs *c = u->op.c;
- struct printbuf buf = PRINTBUF;
-
- prt_newline(&buf);
-
- bch2_data_update_to_text(&buf, u);
- prt_newline(&buf);
-
- prt_str_indented(&buf, "new replicas:\t");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
- prt_newline(&buf);
-
- prt_str_indented(&buf, "insert:\t");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
- prt_newline(&buf);
-
- trace_io_move_finish(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static void trace_io_move_fail2(struct data_update *m,
- struct bkey_s_c new,
- struct bkey_s_c wrote,
- struct bkey_i *insert,
- const char *msg)
-{
- struct bch_fs *c = m->op.c;
- struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
- struct printbuf buf = PRINTBUF;
- unsigned rewrites_found = 0;
-
- if (!trace_io_move_fail_enabled())
- return;
-
- prt_str(&buf, msg);
-
- if (insert) {
- const union bch_extent_entry *entry;
- struct bch_extent_ptr *ptr;
- struct extent_ptr_decoded p;
-
- unsigned ptr_bit = 1;
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
- if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
- (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
- !ptr->cached)
- rewrites_found |= ptr_bit;
- ptr_bit <<= 1;
- }
- }
-
- prt_str(&buf, "rewrites found:\t");
- bch2_prt_u64_base2(&buf, rewrites_found);
- prt_newline(&buf);
-
- bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
-
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, new);
-
- prt_str(&buf, "\nwrote: ");
- bch2_bkey_val_to_text(&buf, c, wrote);
-
- if (insert) {
- prt_str(&buf, "\ninsert: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
- }
-
- trace_io_move_fail(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static void trace_data_update2(struct data_update *m,
- struct bkey_s_c old, struct bkey_s_c k,
- struct bkey_i *insert)
-{
- struct bch_fs *c = m->op.c;
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- trace_data_update(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static void trace_io_move_created_rebalance2(struct data_update *m,
- struct bkey_s_c old, struct bkey_s_c k,
- struct bkey_i *insert)
-{
- struct bch_fs *c = m->op.c;
- struct printbuf buf = PRINTBUF;
-
- bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- trace_io_move_created_rebalance(c, buf.buf);
- printbuf_exit(&buf);
-
- this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]);
-}
-
-noinline_for_stack
-static int data_update_invalid_bkey(struct data_update *m,
- struct bkey_s_c old, struct bkey_s_c k,
- struct bkey_i *insert)
-{
- struct bch_fs *c = m->op.c;
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_str(&buf, "about to insert invalid key in data update path");
- prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
- prt_newline(&buf);
-
- bch2_fs_emergency_read_only2(c, &buf);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- return bch_err_throw(c, invalid_bkey);
-}
-
-static int __bch2_data_update_index_update(struct btree_trans *trans,
- struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_iter iter;
- struct data_update *m = container_of(op, struct data_update, op);
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, m->btree_id,
- bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k),
- BTREE_ITER_slots|BTREE_ITER_intent);
-
- while (1) {
- struct bkey_s_c k;
- struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
- struct bkey_i *insert = NULL;
- struct bkey_i_extent *new;
- const union bch_extent_entry *entry_c;
- union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bch_extent_ptr *ptr;
- const struct bch_extent_ptr *ptr_c;
- struct bpos next_pos;
- bool should_check_enospc;
- s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- unsigned rewrites_found = 0, durability, ptr_bit;
-
- bch2_trans_begin(trans);
-
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys));
-
- if (!bch2_extents_match(k, old)) {
- trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
- NULL, "no match:");
- goto nowork;
- }
-
- insert = bch2_trans_kmalloc(trans,
- bkey_bytes(k.k) +
- bkey_val_bytes(&new->k) +
- sizeof(struct bch_extent_rebalance));
- ret = PTR_ERR_OR_ZERO(insert);
- if (ret)
- goto err;
-
- bkey_reassemble(insert, k);
-
- new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys));
- bch2_cut_front(iter.pos, &new->k_i);
-
- bch2_cut_front(iter.pos, insert);
- bch2_cut_back(new->k.p, insert);
- bch2_cut_back(insert->k.p, &new->k_i);
-
- /*
- * @old: extent that we read from
- * @insert: key that we're going to update, initialized from
- * extent currently in btree - same as @old unless we raced with
- * other updates
- * @new: extent with new pointers that we'll be adding to @insert
- *
- * First, drop rewrite_ptrs from @new:
- */
- ptr_bit = 1;
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
- if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
- (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
- !ptr->cached) {
- bch2_extent_ptr_set_cached(c, &m->op.opts,
- bkey_i_to_s(insert), ptr);
- rewrites_found |= ptr_bit;
- }
- ptr_bit <<= 1;
- }
-
- if (m->data_opts.rewrite_ptrs &&
- !rewrites_found &&
- bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
- trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
- goto nowork;
- }
-
- /*
- * A replica that we just wrote might conflict with a replica
- * that we want to keep, due to racing with another move:
- */
-restart_drop_conflicting_replicas:
- extent_for_each_ptr(extent_i_to_s(new), ptr)
- if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
- !ptr_c->cached) {
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
- goto restart_drop_conflicting_replicas;
- }
-
- if (!bkey_val_u64s(&new->k)) {
- trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
- goto nowork;
- }
-
- /* Now, drop pointers that conflict with what we just wrote: */
- extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
- if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
-
- durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
- bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
-
- /* Now, drop excess replicas: */
- scoped_guard(rcu) {
-restart_drop_extra_replicas:
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
- unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
-
- if (!p.ptr.cached &&
- durability - ptr_durability >= m->op.opts.data_replicas) {
- durability -= ptr_durability;
-
- bch2_extent_ptr_set_cached(c, &m->op.opts,
- bkey_i_to_s(insert), &entry->ptr);
- goto restart_drop_extra_replicas;
- }
- }
- }
-
- /* Finally, add the pointers we just wrote: */
- extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
- bch2_extent_ptr_decoded_append(insert, &p);
-
- bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));
-
- ret = bch2_sum_sector_overwrites(trans, &iter, insert,
- &should_check_enospc,
- &i_sectors_delta,
- &disk_sectors_delta);
- if (ret)
- goto err;
-
- if (disk_sectors_delta > (s64) op->res.sectors) {
- ret = bch2_disk_reservation_add(c, &op->res,
- disk_sectors_delta - op->res.sectors,
- !should_check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- goto out;
- }
-
- next_pos = insert->k.p;
-
- /*
- * Check for nonce offset inconsistency:
- * This is debug code - we've been seeing this bug rarely, and
- * it's been hard to reproduce, so this should give us some more
- * information when it does occur:
- */
- int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
- (struct bkey_validate_context) {
- .btree = m->btree_id,
- .flags = BCH_VALIDATE_commit,
- });
- if (unlikely(invalid)) {
- ret = data_update_invalid_bkey(m, old, k, insert);
- goto out;
- }
-
- ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?:
- bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
- bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, bkey_start_pos(&insert->k)) ?:
- bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
- bch2_trans_update(trans, &iter, insert,
- BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- goto err;
-
- if (trace_data_update_enabled())
- trace_data_update2(m, old, k, insert);
-
- if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
- bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
- trace_io_move_created_rebalance2(m, old, k, insert);
-
- ret = bch2_trans_commit(trans, &op->res,
- NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc|
- m->data_opts.btree_insert_flags);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_pos(trans, &iter, next_pos);
-
- this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
- if (trace_io_move_finish_enabled())
- trace_io_move_finish2(m, &new->k_i, insert);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- if (ret)
- break;
-next:
- while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) {
- bch2_keylist_pop_front(&op->insert_keys);
- if (bch2_keylist_empty(&op->insert_keys))
- goto out;
- }
- continue;
-nowork:
- if (m->stats) {
- BUG_ON(k.k->p.offset <= iter.pos.offset);
- atomic64_inc(&m->stats->keys_raced);
- atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->stats->sectors_raced);
- }
-
- count_event(c, io_move_fail);
-
- bch2_btree_iter_advance(trans, &iter);
- goto next;
- }
-out:
- bch2_trans_iter_exit(trans, &iter);
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
- return ret;
-}
-
-int bch2_data_update_index_update(struct bch_write_op *op)
-{
- return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
-}
-
-void bch2_data_update_read_done(struct data_update *m)
-{
- m->read_done = true;
-
- /* write bio must own pages: */
- BUG_ON(!m->op.wbio.bio.bi_vcnt);
-
- m->op.crc = m->rbio.pick.crc;
- m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
-
- this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);
-
- closure_call(&m->op.cl, bch2_write, NULL, NULL);
-}
-
-void bch2_data_update_exit(struct data_update *update)
-{
- struct bch_fs *c = update->op.c;
- struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
-
- bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
- kfree(update->bvecs);
- update->bvecs = NULL;
-
- if (c->opts.nocow_enabled)
- bkey_nocow_unlock(c, k);
- bkey_put_dev_refs(c, k);
- bch2_disk_reservation_put(c, &update->op.res);
- bch2_bkey_buf_exit(&update->k, c);
-}
-
-static noinline_for_stack
-int bch2_update_unwritten_extent(struct btree_trans *trans,
- struct data_update *update)
-{
- struct bch_fs *c = update->op.c;
- struct bkey_i_extent *e;
- struct write_point *wp;
- struct closure cl;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- closure_init_stack(&cl);
- bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
-
- while (bpos_lt(update->op.pos, update->k.k->k.p)) {
- unsigned sectors = update->k.k->k.p.offset -
- update->op.pos.offset;
-
- bch2_trans_begin(trans);
-
- bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
- BTREE_ITER_slots);
- ret = lockrestart_do(trans, ({
- k = bch2_btree_iter_peek_slot(trans, &iter);
- bkey_err(k);
- }));
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
- break;
-
- e = bkey_extent_init(update->op.insert_keys.top);
- e->k.p = update->op.pos;
-
- ret = bch2_alloc_sectors_start_trans(trans,
- update->op.target,
- false,
- update->op.write_point,
- &update->op.devs_have,
- update->op.nr_replicas,
- update->op.nr_replicas,
- update->op.watermark,
- 0, &cl, &wp);
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- continue;
- }
-
- bch_err_fn_ratelimited(c, ret);
-
- if (ret)
- break;
-
- sectors = min(sectors, wp->sectors_free);
-
- bch2_key_resize(&e->k, sectors);
-
- bch2_open_bucket_get(c, wp, &update->op.open_buckets);
- bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
- bch2_alloc_sectors_done(c, wp);
-
- update->op.pos.offset += sectors;
-
- extent_for_each_ptr(extent_i_to_s(e), ptr)
- ptr->unwritten = true;
- bch2_keylist_push(&update->op.insert_keys);
-
- ret = __bch2_data_update_index_update(trans, &update->op);
-
- bch2_open_buckets_put(c, &update->op.open_buckets);
-
- if (ret)
- break;
- }
-
- if (closure_nr_remaining(&cl) != 1) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- }
-
- return ret;
-}
-
-void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 20);
-
- prt_str_indented(out, "rewrite ptrs:\t");
- bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
- prt_newline(out);
-
- prt_str_indented(out, "kill ptrs:\t");
- bch2_prt_u64_base2(out, data_opts->kill_ptrs);
- prt_newline(out);
-
- prt_str_indented(out, "target:\t");
- bch2_target_to_text(out, c, data_opts->target);
- prt_newline(out);
-
- prt_str_indented(out, "compression:\t");
- bch2_compression_opt_to_text(out, io_opts->background_compression);
- prt_newline(out);
-
- prt_str_indented(out, "opts.replicas:\t");
- prt_u64(out, io_opts->data_replicas);
- prt_newline(out);
-
- prt_str_indented(out, "extra replicas:\t");
- prt_u64(out, data_opts->extra_replicas);
- prt_newline(out);
-
- prt_str_indented(out, "scrub:\t");
- prt_u64(out, data_opts->scrub);
-}
-
-void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
-{
- prt_str(out, bch2_data_update_type_strs[m->type]);
- prt_newline(out);
-
- bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
- prt_newline(out);
-
- prt_str_indented(out, "old key:\t");
- bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
-}
-
-void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
-{
- bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
- prt_newline(out);
- printbuf_indent_add(out, 2);
- bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
-
- if (!m->read_done) {
- prt_printf(out, "read:\n");
- printbuf_indent_add(out, 2);
- bch2_read_bio_to_text(out, &m->rbio);
- } else {
- prt_printf(out, "write:\n");
- printbuf_indent_add(out, 2);
- bch2_write_op_to_text(out, &m->op);
- }
- printbuf_indent_sub(out, 4);
-}
-
-int bch2_extent_drop_ptrs(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i *n;
- int ret;
-
- n = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- while (data_opts->kill_ptrs) {
- unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
-
- bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
- data_opts->kill_ptrs ^= 1U << drop;
- }
-
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_error key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));
-
- /*
- * Since we're not inserting through an extent iterator
- * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
- * we aren't using the extent overwrite path to delete; we're
- * just using the normal key deletion path:
- */
- if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
- n->k.size = 0;
-
- return bch2_trans_relock(trans) ?:
- bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
- struct bch_io_opts *io_opts,
- unsigned buf_bytes)
-{
- unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
-
- m->bvecs = kmalloc_array(nr_vecs, sizeof(*m->bvecs), GFP_KERNEL);
- if (!m->bvecs)
- return -ENOMEM;
-
- bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
- bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
-
- if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
- kfree(m->bvecs);
- m->bvecs = NULL;
- return -ENOMEM;
- }
-
- rbio_init(&m->rbio.bio, c, *io_opts, NULL);
- m->rbio.data_update = true;
- m->rbio.bio.bi_iter.bi_size = buf_bytes;
- m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
- m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
- return 0;
-}
-
-int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
- struct bch_io_opts *io_opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- /* write path might have to decompress data: */
- unsigned buf_bytes = 0;
- bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
- buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-
- return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
-}
-
-static int can_write_extent(struct bch_fs *c, struct data_update *m)
-{
- if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
- unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
- return bch_err_throw(c, data_update_done_would_block);
-
- unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
- ? m->op.target
- : 0;
- struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
-
- darray_for_each(m->op.devs_have, i)
- __clear_bit(*i, devs.d);
-
- guard(rcu)();
-
- unsigned nr_replicas = 0, i;
- for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
- if (!ca)
- continue;
-
- struct bch_dev_usage usage;
- bch2_dev_usage_read_fast(ca, &usage);
-
- if (!dev_buckets_free(ca, usage, m->op.watermark))
- continue;
-
- nr_replicas += ca->mi.durability;
- if (nr_replicas >= m->op.nr_replicas)
- break;
- }
-
- if (!nr_replicas)
- return bch_err_throw(c, data_update_done_no_rw_devs);
- if (nr_replicas < m->op.nr_replicas)
- return bch_err_throw(c, insufficient_devices);
- return 0;
-}
-
-int bch2_data_update_init(struct btree_trans *trans,
- struct btree_iter *iter,
- struct moving_context *ctxt,
- struct data_update *m,
- struct write_point_specifier wp,
- struct bch_io_opts *io_opts,
- struct data_update_opts data_opts,
- enum btree_id btree_id,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- if (k.k->p.snapshot) {
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
- /* Can't repair yet, waiting on other recovery passes */
- return bch_err_throw(c, data_update_done_no_snapshot);
- }
- if (ret < 0)
- return ret;
- if (ret) /* key was deleted */
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- bch_err_throw(c, data_update_done_no_snapshot);
- ret = 0;
- }
-
- bch2_bkey_buf_init(&m->k);
- bch2_bkey_buf_reassemble(&m->k, c, k);
- m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc
- ? BCH_DATA_UPDATE_copygc
- : BCH_DATA_UPDATE_rebalance;
- m->btree_id = btree_id;
- m->data_opts = data_opts;
- m->ctxt = ctxt;
- m->stats = ctxt ? ctxt->stats : NULL;
-
- bch2_write_op_init(&m->op, c, *io_opts);
- m->op.pos = bkey_start_pos(k.k);
- m->op.version = k.k->bversion;
- m->op.target = data_opts.target;
- m->op.write_point = wp;
- m->op.nr_replicas = 0;
- m->op.flags |= BCH_WRITE_pages_stable|
- BCH_WRITE_pages_owned|
- BCH_WRITE_data_encoded|
- BCH_WRITE_move|
- m->data_opts.write_flags;
- m->op.compression_opt = io_opts->background_compression;
- m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
-
- unsigned durability_have = 0, durability_removing = 0;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
- unsigned buf_bytes = 0;
- bool unwritten = false;
-
- unsigned ptr_bit = 1;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (!p.ptr.cached) {
- guard(rcu)();
- if (ptr_bit & m->data_opts.rewrite_ptrs) {
- if (crc_is_compressed(p.crc))
- reserve_sectors += k.k->size;
-
- m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
- durability_removing += bch2_extent_ptr_desired_durability(c, &p);
- } else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
- bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
- durability_have += bch2_extent_ptr_durability(c, &p);
- }
- }
-
- /*
- * op->csum_type is normally initialized from the fs/file's
- * current options - but if an extent is encrypted, we require
- * that it stays encrypted:
- */
- if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
- m->op.nonce = p.crc.nonce + p.crc.offset;
- m->op.csum_type = p.crc.csum_type;
- }
-
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
- m->op.incompressible = true;
-
- buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
- unwritten |= p.ptr.unwritten;
-
- ptr_bit <<= 1;
- }
-
- unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
-
- /*
- * If current extent durability is less than io_opts.data_replicas,
- * we're not trying to rereplicate the extent up to data_replicas here -
- * unless extra_replicas was specified
- *
- * Currently, increasing replication is an explicit operation triggered by
- * rereplicate, so that users don't get an unexpected -ENOSPC
- */
- m->op.nr_replicas = min(durability_removing, durability_required) +
- m->data_opts.extra_replicas;
-
- /*
- * If device(s) were set to durability=0 after data was written to them,
- * we can end up with a durability=0 extent, and the normal algorithm
- * that tries not to increase durability doesn't work:
- */
- if (!(durability_have + durability_removing))
- m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
-
- m->op.nr_replicas_required = m->op.nr_replicas;
-
- /*
- * It might turn out that we don't need any new replicas, if the
- * replicas or durability settings have been changed since the extent
- * was written:
- */
- if (!m->op.nr_replicas) {
- m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
- m->data_opts.rewrite_ptrs = 0;
- /* if iter == NULL, it's just a promote */
- if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
- if (!ret)
- ret = bch_err_throw(c, data_update_done_no_writes_needed);
- goto out_bkey_buf_exit;
- }
-
- /*
- * Check if the allocation will succeed, to avoid getting an error later
- * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
- * read:
- *
- * This guards against
- * - BCH_WRITE_alloc_nowait allocations failing (promotes)
- * - Destination target full
- * - Device(s) in destination target offline
- * - Insufficient durability available in destination target
- * (i.e. trying to move a durability=2 replica to a target with a
- * single durability=2 device)
- */
- ret = can_write_extent(c, m);
- if (ret)
- goto out_bkey_buf_exit;
-
- if (reserve_sectors) {
- ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
- m->data_opts.extra_replicas
- ? 0
- : BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- goto out_bkey_buf_exit;
- }
-
- if (!bkey_get_dev_refs(c, k)) {
- ret = bch_err_throw(c, data_update_done_no_dev_refs);
- goto out_put_disk_res;
- }
-
- if (c->opts.nocow_enabled &&
- !bkey_nocow_lock(c, ctxt, ptrs)) {
- ret = bch_err_throw(c, nocow_lock_blocked);
- goto out_put_dev_refs;
- }
-
- if (unwritten) {
- ret = bch2_update_unwritten_extent(trans, m) ?:
- bch_err_throw(c, data_update_done_unwritten);
- goto out_nocow_unlock;
- }
-
- bch2_trans_unlock(trans);
-
- ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
- if (ret)
- goto out_nocow_unlock;
-
- return 0;
-out_nocow_unlock:
- if (c->opts.nocow_enabled)
- bkey_nocow_unlock(c, k);
-out_put_dev_refs:
- bkey_put_dev_refs(c, k);
-out_put_disk_res:
- bch2_disk_reservation_put(c, &m->op.res);
-out_bkey_buf_exit:
- bch2_bkey_buf_exit(&m->k, c);
- return ret;
-}
-
-void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned ptr_bit = 1;
-
- bkey_for_each_ptr(ptrs, ptr) {
- if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
- opts->kill_ptrs |= ptr_bit;
- opts->rewrite_ptrs ^= ptr_bit;
- }
-
- ptr_bit <<= 1;
- }
-}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
deleted file mode 100644
index 5e14d13568de..000000000000
--- a/fs/bcachefs/data_update.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _BCACHEFS_DATA_UPDATE_H
-#define _BCACHEFS_DATA_UPDATE_H
-
-#include "bkey_buf.h"
-#include "io_read.h"
-#include "io_write_types.h"
-
-struct moving_context;
-
-struct data_update_opts {
- unsigned rewrite_ptrs;
- unsigned kill_ptrs;
- u16 target;
- u8 extra_replicas;
- unsigned btree_insert_flags;
- unsigned write_flags;
-
- int read_dev;
- bool scrub;
-};
-
-void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
- struct bch_io_opts *, struct data_update_opts *);
-
-#define BCH_DATA_UPDATE_TYPES() \
- x(copygc, 0) \
- x(rebalance, 1) \
- x(promote, 2)
-
-enum bch_data_update_types {
-#define x(n, id) BCH_DATA_UPDATE_##n = id,
- BCH_DATA_UPDATE_TYPES()
-#undef x
-};
-
-struct data_update {
- enum bch_data_update_types type;
- /* extent being updated: */
- bool read_done;
- enum btree_id btree_id;
- struct bkey_buf k;
- struct data_update_opts data_opts;
- struct moving_context *ctxt;
- struct bch_move_stats *stats;
-
- struct bch_read_bio rbio;
- struct bch_write_op op;
- struct bio_vec *bvecs;
-};
-
-struct promote_op {
- struct rcu_head rcu;
- u64 start_time;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- unsigned list_idx;
-#endif
-
- struct rhash_head hash;
- struct bpos pos;
-
- struct work_struct work;
- struct data_update write;
- struct bio_vec bi_inline_vecs[]; /* must be last */
-};
-
-void bch2_data_update_to_text(struct printbuf *, struct data_update *);
-void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
-
-int bch2_data_update_index_update(struct bch_write_op *);
-
-void bch2_data_update_read_done(struct data_update *);
-
-int bch2_extent_drop_ptrs(struct btree_trans *,
- struct btree_iter *,
- struct bkey_s_c,
- struct bch_io_opts *,
- struct data_update_opts *);
-
-int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
- struct bch_io_opts *);
-
-void bch2_data_update_exit(struct data_update *);
-int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
- struct moving_context *,
- struct data_update *,
- struct write_point_specifier,
- struct bch_io_opts *, struct data_update_opts,
- enum btree_id, struct bkey_s_c);
-void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
-
-#endif /* _BCACHEFS_DATA_UPDATE_H */
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
deleted file mode 100644
index 07c2a0f73cc2..000000000000
--- a/fs/bcachefs/debug.c
+++ /dev/null
@@ -1,996 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Assorted bcachefs debug code
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "data_update.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal_reclaim.h"
-#include "super.h"
-
-#include <linux/console.h>
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-
-static struct dentry *bch_debug;
-
-static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
- struct extent_ptr_decoded pick)
-{
- struct btree *v = c->verify_data;
- struct btree_node *n_ondisk = c->verify_ondisk;
- struct btree_node *n_sorted = c->verify_data->data;
- struct bset *sorted, *inmemory = &b->data->keys;
- struct bio *bio;
- bool failed = false;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
- BCH_DEV_READ_REF_btree_verify_replicas);
- if (!ca)
- return false;
-
- bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_sorted, btree_buf_bytes(b)),
- REQ_OP_READ|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
-
- submit_bio_wait(bio);
-
- bio_put(bio);
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_btree_verify_replicas);
-
- memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
-
- v->written = 0;
- if (bch2_btree_node_read_done(c, ca, v, NULL, NULL))
- return false;
-
- n_sorted = c->verify_data->data;
- sorted = &n_sorted->keys;
-
- if (inmemory->u64s != sorted->u64s ||
- memcmp(inmemory->start,
- sorted->start,
- vstruct_end(inmemory) - (void *) inmemory->start)) {
- unsigned offset = 0, sectors;
- struct bset *i;
- unsigned j;
-
- console_lock();
-
- printk(KERN_ERR "*** in memory:\n");
- bch2_dump_bset(c, b, inmemory, 0);
-
- printk(KERN_ERR "*** read back in:\n");
- bch2_dump_bset(c, v, sorted, 0);
-
- while (offset < v->written) {
- if (!offset) {
- i = &n_ondisk->keys;
- sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
- c->block_bits;
- } else {
- struct btree_node_entry *bne =
- (void *) n_ondisk + (offset << 9);
- i = &bne->keys;
-
- sectors = vstruct_blocks(bne, c->block_bits) <<
- c->block_bits;
- }
-
- printk(KERN_ERR "*** on disk block %u:\n", offset);
- bch2_dump_bset(c, b, i, offset);
-
- offset += sectors;
- }
-
- for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
- if (inmemory->_data[j] != sorted->_data[j])
- break;
-
- console_unlock();
- bch_err(c, "verify failed at key %u", j);
-
- failed = true;
- }
-
- if (v->written != b->written) {
- bch_err(c, "written wrong: expected %u, got %u",
- b->written, v->written);
- failed = true;
- }
-
- return failed;
-}
-
-void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
-{
- struct bkey_ptrs_c ptrs;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
- struct btree *v;
- struct bset *inmemory = &b->data->keys;
- struct bkey_packed *k;
- bool failed = false;
-
- if (c->opts.nochanges)
- return;
-
- bch2_btree_node_io_lock(b);
- mutex_lock(&c->verify_lock);
-
- if (!c->verify_ondisk) {
- c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
- if (!c->verify_ondisk)
- goto out;
- }
-
- if (!c->verify_data) {
- c->verify_data = __bch2_btree_node_mem_alloc(c);
- if (!c->verify_data)
- goto out;
- }
-
- BUG_ON(b->nsets != 1);
-
- for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
- if (k->type == KEY_TYPE_btree_ptr_v2)
- ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
-
- v = c->verify_data;
- bkey_copy(&v->key, &b->key);
- v->c.level = b->c.level;
- v->c.btree_id = b->c.btree_id;
- bch2_btree_keys_init(v);
-
- ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
- bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
- failed |= bch2_btree_verify_replica(c, b, p);
-
- if (failed) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
- printbuf_exit(&buf);
- }
-out:
- mutex_unlock(&c->verify_lock);
- bch2_btree_node_io_unlock(b);
-}
-
-void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
- const struct btree *b)
-{
- struct btree_node *n_ondisk = NULL;
- struct extent_ptr_decoded pick;
- struct bch_dev *ca;
- struct bio *bio = NULL;
- unsigned offset = 0;
- int ret;
-
- if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) {
- prt_printf(out, "error getting device to read from: invalid device\n");
- return;
- }
-
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
- BCH_DEV_READ_REF_btree_node_ondisk_to_text);
- if (!ca) {
- prt_printf(out, "error getting device to read from: not online\n");
- return;
- }
-
- n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
- if (!n_ondisk) {
- prt_printf(out, "memory allocation failure\n");
- goto out;
- }
-
- bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_ondisk, btree_buf_bytes(b)),
- REQ_OP_READ|REQ_META,
- GFP_NOFS,
- &c->btree_bio);
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
-
- ret = submit_bio_wait(bio);
- if (ret) {
- prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret));
- goto out;
- }
-
- while (offset < btree_sectors(c)) {
- struct bset *i;
- struct nonce nonce;
- struct bch_csum csum;
- struct bkey_packed *k;
- unsigned sectors;
-
- if (!offset) {
- i = &n_ondisk->keys;
-
- if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
- prt_printf(out, "unknown checksum type at offset %u: %llu\n",
- offset, BSET_CSUM_TYPE(i));
- goto out;
- }
-
- nonce = btree_nonce(i, offset << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
-
- if (bch2_crc_cmp(csum, n_ondisk->csum)) {
- prt_printf(out, "invalid checksum\n");
- goto out;
- }
-
- bset_encrypt(c, i, offset << 9);
-
- sectors = vstruct_sectors(n_ondisk, c->block_bits);
- } else {
- struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9);
-
- i = &bne->keys;
-
- if (i->seq != n_ondisk->keys.seq)
- break;
-
- if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
- prt_printf(out, "unknown checksum type at offset %u: %llu\n",
- offset, BSET_CSUM_TYPE(i));
- goto out;
- }
-
- nonce = btree_nonce(i, offset << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
- if (bch2_crc_cmp(csum, bne->csum)) {
- prt_printf(out, "invalid checksum");
- goto out;
- }
-
- bset_encrypt(c, i, offset << 9);
-
- sectors = vstruct_sectors(bne, c->block_bits);
- }
-
- prt_printf(out, " offset %u version %u, journal seq %llu\n",
- offset,
- le16_to_cpu(i->version),
- le64_to_cpu(i->journal_seq));
- offset += sectors;
-
- printbuf_indent_add(out, 4);
-
- for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) {
- struct bkey u;
-
- bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u));
- prt_newline(out);
- }
-
- printbuf_indent_sub(out, 4);
- }
-out:
- if (bio)
- bio_put(bio);
- kvfree(n_ondisk);
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_btree_node_ondisk_to_text);
-}
-
-#ifdef CONFIG_DEBUG_FS
-
-ssize_t bch2_debugfs_flush_buf(struct dump_iter *i)
-{
- if (i->buf.pos) {
- size_t bytes = min_t(size_t, i->buf.pos, i->size);
- int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
-
- i->ret += copied;
- i->ubuf += copied;
- i->size -= copied;
- i->buf.pos -= copied;
- memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
-
- if (i->buf.last_newline >= copied)
- i->buf.last_newline -= copied;
- if (i->buf.last_field >= copied)
- i->buf.last_field -= copied;
-
- if (copied != bytes)
- return -EFAULT;
- }
-
- return i->size ? 0 : i->ret;
-}
-
-static int bch2_dump_open(struct inode *inode, struct file *file)
-{
- struct btree_debug *bd = inode->i_private;
- struct dump_iter *i;
-
- i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
- if (!i)
- return -ENOMEM;
-
- file->private_data = i;
- i->from = POS_MIN;
- i->iter = 0;
- i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
- i->id = bd->id;
- i->buf = PRINTBUF;
-
- return 0;
-}
-
-int bch2_dump_release(struct inode *inode, struct file *file)
-{
- struct dump_iter *i = file->private_data;
-
- printbuf_exit(&i->buf);
- kfree(i);
- return 0;
-}
-
-static ssize_t bch2_read_btree(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- return bch2_debugfs_flush_buf(i) ?:
- bch2_trans_run(i->c,
- for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- bch2_bkey_val_to_text(&i->buf, i->c, k);
- prt_newline(&i->buf);
- bch2_trans_unlock(trans);
- i->from = bpos_successor(iter.pos);
- bch2_debugfs_flush_buf(i);
- }))) ?:
- i->ret;
-}
-
-static const struct file_operations btree_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_read_btree,
-};
-
-static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- ssize_t ret = bch2_debugfs_flush_buf(i);
- if (ret)
- return ret;
-
- if (bpos_eq(SPOS_MAX, i->from))
- return i->ret;
-
- return bch2_trans_run(i->c,
- for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({
- bch2_btree_node_to_text(&i->buf, i->c, b);
- i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
- ? bpos_successor(b->key.k.p)
- : b->key.k.p;
-
- drop_locks_do(trans, bch2_debugfs_flush_buf(i));
- }))) ?: i->ret;
-}
-
-static const struct file_operations btree_format_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_read_btree_formats,
-};
-
-static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- return bch2_debugfs_flush_buf(i) ?:
- bch2_trans_run(i->c,
- for_each_btree_key(trans, iter, i->id, i->from,
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- struct btree_path_level *l =
- &btree_iter_path(trans, &iter)->l[0];
- struct bkey_packed *_k =
- bch2_btree_node_iter_peek(&l->iter, l->b);
-
- if (bpos_gt(l->b->key.k.p, i->prev_node)) {
- bch2_btree_node_to_text(&i->buf, i->c, l->b);
- i->prev_node = l->b->key.k.p;
- }
-
- bch2_bfloat_to_text(&i->buf, l->b, _k);
- bch2_trans_unlock(trans);
- i->from = bpos_successor(iter.pos);
- bch2_debugfs_flush_buf(i);
- }))) ?:
- i->ret;
-}
-
-static const struct file_operations bfloat_failed_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_read_bfloat_failed,
-};
-
-static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
- struct btree *b)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- prt_printf(out, "%px ", b);
- bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level);
- prt_printf(out, "\n");
-
- printbuf_indent_add(out, 2);
-
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
- prt_newline(out);
-
- prt_printf(out, "flags:\t");
- prt_bitflags(out, bch2_btree_node_flags, b->flags);
- prt_newline(out);
-
- prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL);
- prt_printf(out, "written:\t%u\n", b->written);
- prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked));
- prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable);
-
- prt_printf(out, "journal pin %px:\t%llu\n",
- &b->writes[0].journal, b->writes[0].journal.seq);
- prt_printf(out, "journal pin %px:\t%llu\n",
- &b->writes[1].journal, b->writes[1].journal.seq);
-
- prt_printf(out, "ob:\t%u\n", b->ob.nr);
-
- printbuf_indent_sub(out, 2);
-}
-
-static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- bool done = false;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- do {
- ret = bch2_debugfs_flush_buf(i);
- if (ret)
- return ret;
-
- i->buf.atomic++;
- scoped_guard(rcu) {
- struct bucket_table *tbl =
- rht_dereference_rcu(c->btree_cache.table.tbl,
- &c->btree_cache.table);
- if (i->iter < tbl->size) {
- struct rhash_head *pos;
- struct btree *b;
-
- rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
- bch2_cached_btree_node_to_text(&i->buf, c, b);
- i->iter++;
- } else {
- done = true;
- }
- }
- --i->buf.atomic;
- } while (!done);
-
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
-
- if (!ret)
- ret = bch2_debugfs_flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static const struct file_operations cached_btree_nodes_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_cached_btree_nodes_read,
-};
-
-typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r);
-
-static void list_sort(struct list_head *head, list_cmp_fn cmp)
-{
- struct list_head *pos;
-
- list_for_each(pos, head)
- while (!list_is_last(pos, head) &&
- cmp(pos, pos->next) > 0) {
- struct list_head *pos2, *next = pos->next;
-
- list_del(next);
- list_for_each(pos2, head)
- if (cmp(next, pos2) < 0)
- goto pos_found;
- BUG();
-pos_found:
- list_add_tail(next, pos2);
- }
-}
-
-static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r)
-{
- return cmp_int(l, r);
-}
-
-static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- struct btree_trans *trans;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-restart:
- seqmutex_lock(&c->btree_trans_lock);
- list_sort(&c->btree_trans_list, list_ptr_order_cmp);
-
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- if ((ulong) trans <= i->iter)
- continue;
-
- i->iter = (ulong) trans;
-
- if (!closure_get_not_zero(&trans->ref))
- continue;
-
- if (!trans->srcu_held) {
- closure_put(&trans->ref);
- continue;
- }
-
- u32 seq = seqmutex_unlock(&c->btree_trans_lock);
-
- bch2_btree_trans_to_text(&i->buf, trans);
-
- prt_printf(&i->buf, "backtrace:\n");
- printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL);
- printbuf_indent_sub(&i->buf, 2);
- prt_newline(&i->buf);
-
- closure_put(&trans->ref);
-
- ret = bch2_debugfs_flush_buf(i);
- if (ret)
- goto unlocked;
-
- if (!seqmutex_relock(&c->btree_trans_lock, seq))
- goto restart;
- }
- seqmutex_unlock(&c->btree_trans_lock);
-unlocked:
- srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
-
- if (!ret)
- ret = bch2_debugfs_flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static const struct file_operations btree_transactions_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_btree_transactions_read,
-};
-
-static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- bool done = false;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- while (1) {
- err = bch2_debugfs_flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- break;
-
- if (done)
- break;
-
- done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
- i->iter++;
- }
-
- if (i->buf.allocation_failure)
- return -ENOMEM;
-
- return i->ret;
-}
-
-static const struct file_operations journal_pins_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_journal_pins_read,
-};
-
-static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (!i->iter) {
- bch2_btree_updates_to_text(&i->buf, c);
- i->iter++;
- }
-
- err = bch2_debugfs_flush_buf(i);
- if (err)
- return err;
-
- if (i->buf.allocation_failure)
- return -ENOMEM;
-
- return i->ret;
-}
-
-static const struct file_operations btree_updates_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_btree_updates_read,
-};
-
-static int btree_transaction_stats_open(struct inode *inode, struct file *file)
-{
- struct bch_fs *c = inode->i_private;
- struct dump_iter *i;
-
- i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
- if (!i)
- return -ENOMEM;
-
- i->iter = 1;
- i->c = c;
- i->buf = PRINTBUF;
- file->private_data = i;
-
- return 0;
-}
-
-static int btree_transaction_stats_release(struct inode *inode, struct file *file)
-{
- struct dump_iter *i = file->private_data;
-
- printbuf_exit(&i->buf);
- kfree(i);
-
- return 0;
-}
-
-static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- while (1) {
- struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
-
- err = bch2_debugfs_flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- break;
-
- if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
- !bch2_btree_transaction_fns[i->iter])
- break;
-
- prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]);
- printbuf_indent_add(&i->buf, 2);
-
- mutex_lock(&s->lock);
-
- prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
-#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
- printbuf_indent_add(&i->buf, 2);
- bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace);
- printbuf_indent_sub(&i->buf, 2);
-#endif
-
- prt_printf(&i->buf, "Transaction duration:\n");
-
- printbuf_indent_add(&i->buf, 2);
- bch2_time_stats_to_text(&i->buf, &s->duration);
- printbuf_indent_sub(&i->buf, 2);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
- prt_printf(&i->buf, "Lock hold times:\n");
-
- printbuf_indent_add(&i->buf, 2);
- bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
- printbuf_indent_sub(&i->buf, 2);
- }
-
- if (s->max_paths_text) {
- prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths);
-
- printbuf_indent_add(&i->buf, 2);
- prt_str_indented(&i->buf, s->max_paths_text);
- printbuf_indent_sub(&i->buf, 2);
- }
-
- mutex_unlock(&s->lock);
-
- printbuf_indent_sub(&i->buf, 2);
- prt_newline(&i->buf);
- i->iter++;
- }
-
- if (i->buf.allocation_failure)
- return -ENOMEM;
-
- return i->ret;
-}
-
-static const struct file_operations btree_transaction_stats_op = {
- .owner = THIS_MODULE,
- .open = btree_transaction_stats_open,
- .release = btree_transaction_stats_release,
- .read = btree_transaction_stats_read,
-};
-
-/* walk btree transactions until we find a deadlock and print it */
-static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct btree_trans *trans;
- ulong iter = 0;
-restart:
- seqmutex_lock(&c->btree_trans_lock);
- list_sort(&c->btree_trans_list, list_ptr_order_cmp);
-
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- if ((ulong) trans <= iter)
- continue;
-
- iter = (ulong) trans;
-
- if (!closure_get_not_zero(&trans->ref))
- continue;
-
- u32 seq = seqmutex_unlock(&c->btree_trans_lock);
-
- bool found = bch2_check_for_deadlock(trans, out) != 0;
-
- closure_put(&trans->ref);
-
- if (found)
- return;
-
- if (!seqmutex_relock(&c->btree_trans_lock, seq))
- goto restart;
- }
- seqmutex_unlock(&c->btree_trans_lock);
-}
-
-typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *);
-
-static ssize_t bch2_simple_print(struct file *file, char __user *buf,
- size_t size, loff_t *ppos,
- fs_to_text_fn fn)
-{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
- ssize_t ret = 0;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (!i->iter) {
- fn(&i->buf, c);
- i->iter++;
- }
-
- if (i->buf.allocation_failure)
- ret = -ENOMEM;
-
- if (!ret)
- ret = bch2_debugfs_flush_buf(i);
-
- return ret ?: i->ret;
-}
-
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text);
-}
-
-static const struct file_operations btree_deadlock_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_btree_deadlock_read,
-};
-
-static ssize_t bch2_write_points_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text);
-}
-
-static const struct file_operations write_points_ops = {
- .owner = THIS_MODULE,
- .open = bch2_dump_open,
- .release = bch2_dump_release,
- .read = bch2_write_points_read,
-};
-
-void bch2_fs_debug_exit(struct bch_fs *c)
-{
- if (!IS_ERR_OR_NULL(c->fs_debug_dir))
- debugfs_remove_recursive(c->fs_debug_dir);
-}
-
-static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
-{
- struct dentry *d;
-
- d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
-
- debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
-
- debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
-
- debugfs_create_file("bfloat-failed", 0400, d, bd,
- &bfloat_failed_debug_ops);
-}
-
-void bch2_fs_debug_init(struct bch_fs *c)
-{
- struct btree_debug *bd;
- char name[100];
-
- if (IS_ERR_OR_NULL(bch_debug))
- return;
-
- if (c->sb.multi_device)
- snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
- else
- strscpy(name, c->name, sizeof(name));
-
- c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
- if (IS_ERR_OR_NULL(c->fs_debug_dir))
- return;
-
- debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
- c->btree_debug, &cached_btree_nodes_ops);
-
- debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
- c->btree_debug, &btree_transactions_ops);
-
- debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
- c->btree_debug, &journal_pins_ops);
-
- debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
- c->btree_debug, &btree_updates_ops);
-
- debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
- c, &btree_transaction_stats_op);
-
- debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
- c->btree_debug, &btree_deadlock_ops);
-
- debugfs_create_file("write_points", 0400, c->fs_debug_dir,
- c->btree_debug, &write_points_ops);
-
- bch2_fs_async_obj_debugfs_init(c);
-
- c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
- if (IS_ERR_OR_NULL(c->btree_debug_dir))
- return;
-
- for (bd = c->btree_debug;
- bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
- bd++) {
- bd->id = bd - c->btree_debug;
- bch2_fs_debug_btree_init(c, bd);
- }
-}
-
-#endif
-
-void bch2_debug_exit(void)
-{
- if (!IS_ERR_OR_NULL(bch_debug))
- debugfs_remove_recursive(bch_debug);
-}
-
-int __init bch2_debug_init(void)
-{
- bch_debug = debugfs_create_dir("bcachefs", NULL);
- return 0;
-}
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
deleted file mode 100644
index d88b1194b8ac..000000000000
--- a/fs/bcachefs/debug.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DEBUG_H
-#define _BCACHEFS_DEBUG_H
-
-#include "bcachefs.h"
-
-struct bio;
-struct btree;
-struct bch_fs;
-
-void __bch2_btree_verify(struct bch_fs *, struct btree *);
-void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
- const struct btree *);
-
-static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
-{
- if (static_branch_unlikely(&bch2_verify_btree_ondisk))
- __bch2_btree_verify(c, b);
-}
-
-#ifdef CONFIG_DEBUG_FS
-struct dump_iter {
- struct bch_fs *c;
- struct async_obj_list *list;
- enum btree_id id;
- struct bpos from;
- struct bpos prev_node;
- u64 iter;
-
- struct printbuf buf;
-
- char __user *ubuf; /* destination user buffer */
- size_t size; /* size of requested read */
- ssize_t ret; /* bytes read so far */
-};
-
-ssize_t bch2_debugfs_flush_buf(struct dump_iter *);
-int bch2_dump_release(struct inode *, struct file *);
-
-void bch2_fs_debug_exit(struct bch_fs *);
-void bch2_fs_debug_init(struct bch_fs *);
-#else
-static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
-static inline void bch2_fs_debug_init(struct bch_fs *c) {}
-#endif
-
-void bch2_debug_exit(void);
-int bch2_debug_init(void);
-
-#endif /* _BCACHEFS_DEBUG_H */
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
deleted file mode 100644
index 28875c5c86ad..000000000000
--- a/fs/bcachefs/dirent.c
+++ /dev/null
@@ -1,766 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "dirent.h"
-#include "fs.h"
-#include "keylist.h"
-#include "str_hash.h"
-#include "subvolume.h"
-
-#include <linux/dcache.h>
-
-#ifdef CONFIG_UNICODE
-int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
-{
- *out_cf = (struct qstr) QSTR_INIT(NULL, 0);
-
- if (!bch2_fs_casefold_enabled(trans->c))
- return -EOPNOTSUPP;
-
- unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
- int ret = PTR_ERR_OR_ZERO(buf);
- if (ret)
- return ret;
-
- ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1);
- if (ret <= 0)
- return ret;
-
- *out_cf = (struct qstr) QSTR_INIT(buf, ret);
- return 0;
-}
-#endif
-
-static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
-{
- if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
- return 0;
-
- unsigned bkey_u64s = bkey_val_u64s(d.k);
- unsigned bkey_bytes = bkey_u64s * sizeof(u64);
- u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
-#if CPU_BIG_ENDIAN
- unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8;
-#else
- unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8;
-#endif
-
- return bkey_bytes -
- (d.v->d_casefold
- ? offsetof(struct bch_dirent, d_cf_name_block.d_names)
- : offsetof(struct bch_dirent, d_name)) -
- trailing_nuls;
-}
-
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
-{
- if (d.v->d_casefold) {
- unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
- return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len);
- } else {
- return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
- }
-}
-
-static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d)
-{
- if (d.v->d_casefold) {
- unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
- unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len);
- return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len);
- } else {
- return (struct qstr) QSTR_INIT(NULL, 0);
- }
-}
-
-static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d)
-{
- return d.v->d_casefold
- ? bch2_dirent_get_casefold_name(d)
- : bch2_dirent_get_name(d);
-}
-
-static u64 bch2_dirent_hash(const struct bch_hash_info *info,
- const struct qstr *name)
-{
- struct bch_str_hash_ctx ctx;
-
- bch2_str_hash_init(&ctx, info);
- bch2_str_hash_update(&ctx, info, name->name, name->len);
-
- /* [0,2) reserved for dots */
- return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
-}
-
-static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
-{
- return bch2_dirent_hash(info, key);
-}
-
-static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr name = bch2_dirent_get_lookup_name(d);
-
- return bch2_dirent_hash(info, &name);
-}
-
-static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
-{
- struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- const struct qstr l_name = bch2_dirent_get_lookup_name(l);
- const struct qstr *r_name = _r;
-
- return !qstr_eq(l_name, *r_name);
-}
-
-static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
- struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
- const struct qstr l_name = bch2_dirent_get_lookup_name(l);
- const struct qstr r_name = bch2_dirent_get_lookup_name(r);
-
- return !qstr_eq(l_name, r_name);
-}
-
-static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- if (d.v->d_type == DT_SUBVOL)
- return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
- return true;
-}
-
-const struct bch_hash_desc bch2_dirent_hash_desc = {
- .btree_id = BTREE_ID_dirents,
- .key_type = KEY_TYPE_dirent,
- .hash_key = dirent_hash_key,
- .hash_bkey = dirent_hash_bkey,
- .cmp_key = dirent_cmp_key,
- .cmp_bkey = dirent_cmp_bkey,
- .is_visible = dirent_is_visible,
-};
-
-int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- unsigned name_block_len = bch2_dirent_name_bytes(d);
- struct qstr d_name = bch2_dirent_get_name(d);
- struct qstr d_cf_name = bch2_dirent_get_casefold_name(d);
- int ret = 0;
-
- bkey_fsck_err_on(!d_name.len,
- c, dirent_empty_name,
- "empty name");
-
- bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len,
- c, dirent_val_too_big,
- "dirent names exceed bkey size (%d + %d > %d)",
- d_name.len, d_cf_name.len, name_block_len);
-
- /*
- * Check new keys don't exceed the max length
- * (older keys may be larger.)
- */
- bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
- c, dirent_name_too_long,
- "dirent name too big (%u > %u)",
- d_name.len, BCH_NAME_MAX);
-
- bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len),
- c, dirent_name_embedded_nul,
- "dirent has stray data after name's NUL");
-
- bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
- (d_name.len == 2 && !memcmp(d_name.name, "..", 2)),
- c, dirent_name_dot_or_dotdot,
- "invalid name");
-
- bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len),
- c, dirent_name_has_slash,
- "name with /");
-
- bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
- le64_to_cpu(d.v->d_inum) == d.k->p.inode,
- c, dirent_to_itself,
- "dirent points to own directory");
-
- if (d.v->d_casefold) {
- bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit &&
- d_cf_name.len > BCH_NAME_MAX,
- c, dirent_cf_name_too_big,
- "dirent w/ cf name too big (%u > %u)",
- d_cf_name.len, BCH_NAME_MAX);
-
- bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len),
- c, dirent_stray_data_after_cf_name,
- "dirent has stray data after cf name's NUL");
- }
-fsck_err:
- return ret;
-}
-
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr d_name = bch2_dirent_get_name(d);
-
- prt_printf(out, "%.*s", d_name.len, d_name.name);
-
- if (d.v->d_casefold) {
- struct qstr d_name = bch2_dirent_get_lookup_name(d);
- prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name);
- }
-
- prt_str(out, " ->");
-
- if (d.v->d_type != DT_SUBVOL)
- prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum));
- else
- prt_printf(out, " %u -> %u",
- le32_to_cpu(d.v->d_parent_subvol),
- le32_to_cpu(d.v->d_child_subvol));
-
- prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
-}
-
-int bch2_dirent_init_name(struct bch_fs *c,
- struct bkey_i_dirent *dirent,
- const struct bch_hash_info *hash_info,
- const struct qstr *name,
- const struct qstr *cf_name)
-{
- EBUG_ON(hash_info->cf_encoding == NULL && cf_name);
- int cf_len = 0;
-
- if (name->len > BCH_NAME_MAX)
- return -ENAMETOOLONG;
-
- dirent->v.d_casefold = hash_info->cf_encoding != NULL;
-
- if (!dirent->v.d_casefold) {
- memcpy(&dirent->v.d_name[0], name->name, name->len);
- memset(&dirent->v.d_name[name->len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_name) -
- name->len);
- } else {
- if (!bch2_fs_casefold_enabled(c))
- return -EOPNOTSUPP;
-
-#ifdef CONFIG_UNICODE
- memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
-
- char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len];
-
- if (cf_name) {
- cf_len = cf_name->len;
-
- memcpy(cf_out, cf_name->name, cf_name->len);
- } else {
- cf_len = utf8_casefold(hash_info->cf_encoding, name,
- cf_out,
- bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out);
- if (cf_len <= 0)
- return cf_len;
- }
-
- memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_cf_name_block.d_names) -
- name->len + cf_len);
-
- dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
- dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len);
-
- EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len);
-#endif
- }
-
- unsigned u64s = dirent_val_u64s(name->len, cf_len);
- BUG_ON(u64s > bkey_val_u64s(&dirent->k));
- set_bkey_val_u64s(&dirent->k, u64s);
- return 0;
-}
-
-struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans,
- const struct bch_hash_info *hash_info,
- subvol_inum dir,
- u8 type,
- const struct qstr *name,
- const struct qstr *cf_name,
- u64 dst)
-{
- struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
- if (IS_ERR(dirent))
- return dirent;
-
- bkey_dirent_init(&dirent->k_i);
- dirent->k.u64s = BKEY_U64s_MAX;
-
- if (type != DT_SUBVOL) {
- dirent->v.d_inum = cpu_to_le64(dst);
- } else {
- dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
- dirent->v.d_child_subvol = cpu_to_le32(dst);
- }
-
- dirent->v.d_type = type;
- dirent->v.d_unused = 0;
-
- int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name);
- if (ret)
- return ERR_PTR(ret);
-
- EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
- return dirent;
-}
-
-int bch2_dirent_create_snapshot(struct btree_trans *trans,
- u32 dir_subvol, u64 dir, u32 snapshot,
- const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset,
- enum btree_iter_update_trigger_flags flags)
-{
- subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
- struct bkey_i_dirent *dirent;
- int ret;
-
- dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
- ret = PTR_ERR_OR_ZERO(dirent);
- if (ret)
- return ret;
-
- dirent->k.p.inode = dir;
- dirent->k.p.snapshot = snapshot;
-
- ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, snapshot, &dirent->k_i, flags);
- *dir_offset = dirent->k.p.offset;
-
- return ret;
-}
-
-int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
- const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i_dirent *dirent;
- int ret;
-
- dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
- ret = PTR_ERR_OR_ZERO(dirent);
- if (ret)
- return ret;
-
- ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, flags);
- *dir_offset = dirent->k.p.offset;
-
- return ret;
-}
-
-int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
- struct bkey_s_c_dirent d, subvol_inum *target)
-{
- struct bch_subvolume s;
- int ret = 0;
-
- if (d.v->d_type == DT_SUBVOL &&
- le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
- return 1;
-
- if (likely(d.v->d_type != DT_SUBVOL)) {
- target->subvol = dir.subvol;
- target->inum = le64_to_cpu(d.v->d_inum);
- } else {
- target->subvol = le32_to_cpu(d.v->d_child_subvol);
-
- ret = bch2_subvolume_get(trans, target->subvol, true, &s);
-
- target->inum = le64_to_cpu(s.inode);
- }
-
- return ret;
-}
-
-int bch2_dirent_rename(struct btree_trans *trans,
- subvol_inum src_dir, struct bch_hash_info *src_hash,
- subvol_inum dst_dir, struct bch_hash_info *dst_hash,
- const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
- const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
- enum bch_rename_mode mode)
-{
- struct qstr src_name_lookup, dst_name_lookup;
- struct btree_iter src_iter = {};
- struct btree_iter dst_iter = {};
- struct bkey_s_c old_src, old_dst = bkey_s_c_null;
- struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
- struct bpos dst_pos =
- POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
- unsigned src_update_flags = 0;
- bool delete_src, delete_dst;
- int ret = 0;
-
- memset(src_inum, 0, sizeof(*src_inum));
- memset(dst_inum, 0, sizeof(*dst_inum));
-
- /* Lookup src: */
- ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup);
- if (ret)
- goto out;
- old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
- src_hash, src_dir, &src_name_lookup,
- BTREE_ITER_intent);
- ret = bkey_err(old_src);
- if (ret)
- goto out;
-
- ret = bch2_dirent_read_target(trans, src_dir,
- bkey_s_c_to_dirent(old_src), src_inum);
- if (ret)
- goto out;
-
- /* Lookup dst: */
- ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup);
- if (ret)
- goto out;
- if (mode == BCH_RENAME) {
- /*
- * Note that we're _not_ checking if the target already exists -
- * we're relying on the VFS to do that check for us for
- * correctness:
- */
- ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, &dst_name_lookup);
- if (ret)
- goto out;
- } else {
- old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, &dst_name_lookup,
- BTREE_ITER_intent);
- ret = bkey_err(old_dst);
- if (ret)
- goto out;
-
- ret = bch2_dirent_read_target(trans, dst_dir,
- bkey_s_c_to_dirent(old_dst), dst_inum);
- if (ret)
- goto out;
- }
-
- if (mode != BCH_RENAME_EXCHANGE)
- *src_offset = dst_iter.pos.offset;
-
- /* Create new dst key: */
- new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
- dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
- ret = PTR_ERR_OR_ZERO(new_dst);
- if (ret)
- goto out;
-
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
- new_dst->k.p = dst_iter.pos;
-
- /* Create new src key: */
- if (mode == BCH_RENAME_EXCHANGE) {
- new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name,
- src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
- ret = PTR_ERR_OR_ZERO(new_src);
- if (ret)
- goto out;
-
- dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
- new_src->k.p = src_iter.pos;
- } else {
- new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
- ret = PTR_ERR_OR_ZERO(new_src);
- if (ret)
- goto out;
-
- bkey_init(&new_src->k);
- new_src->k.p = src_iter.pos;
-
- if (bkey_le(dst_pos, src_iter.pos) &&
- bkey_lt(src_iter.pos, dst_iter.pos)) {
- /*
- * We have a hash collision for the new dst key,
- * and new_src - the key we're deleting - is between
- * new_dst's hashed slot and the slot we're going to be
- * inserting it into - oops. This will break the hash
- * table if we don't deal with it:
- */
- if (mode == BCH_RENAME) {
- /*
- * If we're not overwriting, we can just insert
- * new_dst at the src position:
- */
- new_src = new_dst;
- new_src->k.p = src_iter.pos;
- goto out_set_src;
- } else {
- /* If we're overwriting, we can't insert new_dst
- * at a different slot because it has to
- * overwrite old_dst - just make sure to use a
- * whiteout when deleting src:
- */
- new_src->k.type = KEY_TYPE_hash_whiteout;
- }
- } else {
- /* Check if we need a whiteout to delete src: */
- ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
- src_hash, &src_iter);
- if (ret < 0)
- goto out;
-
- if (ret)
- new_src->k.type = KEY_TYPE_hash_whiteout;
- }
- }
-
- if (new_dst->v.d_type == DT_SUBVOL)
- new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
-
- if ((mode == BCH_RENAME_EXCHANGE) &&
- new_src->v.d_type == DT_SUBVOL)
- new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
-
- ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
- if (ret)
- goto out;
-out_set_src:
- /*
- * If we're deleting a subvolume we need to really delete the dirent,
- * not just emit a whiteout in the current snapshot - there can only be a
- * single dirent that points to a given subvolume.
- *
- * IOW, we don't maintain multiple versions in different snapshots of
- * dirents that point to subvolumes - dirents that point to subvolumes
- * are only visible in one particular subvolume so it's not necessary,
- * and it would be particularly confusing for fsck to have to deal with.
- */
- delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
- new_src->k.p.snapshot != old_src.k->p.snapshot;
-
- delete_dst = old_dst.k &&
- bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
- new_dst->k.p.snapshot != old_dst.k->p.snapshot;
-
- if (!delete_src || !bkey_deleted(&new_src->k)) {
- ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
- if (ret)
- goto out;
- }
-
- if (delete_src) {
- bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot);
- ret = bch2_btree_iter_traverse(trans, &src_iter) ?:
- bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- goto out;
- }
-
- if (delete_dst) {
- bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot);
- ret = bch2_btree_iter_traverse(trans, &dst_iter) ?:
- bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- goto out;
- }
-
- if (mode == BCH_RENAME_EXCHANGE)
- *src_offset = new_src->k.p.offset;
- *dst_offset = new_dst->k.p.offset;
-out:
- bch2_trans_iter_exit(trans, &src_iter);
- bch2_trans_iter_exit(trans, &dst_iter);
- return ret;
-}
-
-int bch2_dirent_lookup_trans(struct btree_trans *trans,
- struct btree_iter *iter,
- subvol_inum dir,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, subvol_inum *inum,
- unsigned flags)
-{
- struct qstr lookup_name;
- int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
- if (ret)
- return ret;
-
- struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, &lookup_name, flags);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
- if (ret > 0)
- ret = -ENOENT;
-err:
- if (ret)
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, subvol_inum *inum)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = {};
-
- int ret = lockrestart_do(trans,
- bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(dir, 0, snapshot),
- POS(dir, U64_MAX), 0, k, ret)
- if (k.k->type == KEY_TYPE_dirent) {
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
- continue;
- ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty);
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
-{
- u32 snapshot;
-
- return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
- bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
-}
-
-static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
-{
- struct qstr name = bch2_dirent_get_name(d);
- /*
- * Although not required by the kernel code, updating ctx->pos is needed
- * for the bcachefs FUSE driver. Without this update, the FUSE
- * implementation will be stuck in an infinite loop when reading
- * directories (via the bcachefs_fuse_readdir callback).
- * In kernel space, ctx->pos is updated by the VFS code.
- */
- ctx->pos = d.k->p.offset;
- bool ret = dir_emit(ctx, name.name,
- name.len,
- target.inum,
- vfs_d_type(d.v->d_type));
- if (ret)
- ctx->pos = d.k->p.offset + 1;
- return !ret;
-}
-
-int bch2_readdir(struct bch_fs *c, subvol_inum inum,
- struct bch_hash_info *hash_info,
- struct dir_context *ctx)
-{
- struct bkey_buf sk;
- bch2_bkey_buf_init(&sk);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents,
- POS(inum.inum, ctx->pos),
- POS(inum.inum, U64_MAX),
- inum.subvol, 0, k, ({
- if (k.k->type != KEY_TYPE_dirent)
- continue;
-
- /* dir_emit() can fault and block: */
- bch2_bkey_buf_reassemble(&sk, c, k);
- struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
-
- subvol_inum target;
-
- bool need_second_pass = false;
- int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc,
- hash_info, &iter, k, &need_second_pass) ?:
- bch2_dirent_read_target(trans, inum, dirent, &target);
- if (ret2 > 0)
- continue;
-
- ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target));
- })));
-
- bch2_bkey_buf_exit(&sk, c);
-
- return ret < 0 ? ret : 0;
-}
-
-/* fsck */
-
-static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inode_nr)
- break;
- if (!bkey_is_inode(k.k))
- continue;
- ret = bch2_inode_unpack(k, inode);
- goto found;
- }
- ret = bch_err_throw(trans->c, ENOENT_inode);
-found:
- bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bch_inode_unpacked dir_inode;
- struct bch_hash_info dir_hash_info;
- int ret;
-
- ret = lookup_first_inode(trans, pos.inode, &dir_inode);
- if (ret)
- goto err;
-
- dir_hash_info = bch2_hash_info_init(c, &dir_inode);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
-
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter);
-err:
- bch_err_fn(c, ret);
- return ret;
-}
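
The collision handling in bch2_dirent_rename() above is a consequence of the linear-probed hash scheme: deleting an entry that later entries probed past must leave a whiteout, otherwise lookups would stop at the freed slot and miss them. A toy userspace illustration of that invariant (plain C, fixed-size table; nothing here is bcachefs code):

#include <stdio.h>
#include <string.h>

#define NSLOTS 8

enum slot_state { EMPTY, USED, WHITEOUT };

struct slot {
	enum slot_state state;
	char key[16];
};

static unsigned hash(const char *s)
{
	unsigned h = 5381;
	while (*s)
		h = h * 33 + (unsigned char) *s++;
	return h % NSLOTS;
}

static void insert(struct slot *t, const char *key)
{
	for (unsigned i = hash(key), n = 0; n < NSLOTS; i = (i + 1) % NSLOTS, n++)
		if (t[i].state != USED) {
			t[i].state = USED;
			snprintf(t[i].key, sizeof(t[i].key), "%s", key);
			return;
		}
}

static int lookup(struct slot *t, const char *key)
{
	/* Probing must continue past WHITEOUT slots and stop only at EMPTY. */
	for (unsigned i = hash(key), n = 0; n < NSLOTS; i = (i + 1) % NSLOTS, n++) {
		if (t[i].state == EMPTY)
			return -1;
		if (t[i].state == USED && !strcmp(t[i].key, key))
			return i;
	}
	return -1;
}

static void delete(struct slot *t, const char *key)
{
	int i = lookup(t, key);

	if (i >= 0)
		t[i].state = WHITEOUT;	/* tombstone, not EMPTY */
}

int main(void)
{
	struct slot table[NSLOTS] = { 0 };

	insert(table, "a");
	insert(table, "b");	/* may collide and probe past "a" */
	delete(table, "a");
	printf("b found at slot %d\n", lookup(table, "b"));
	return 0;
}
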
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
deleted file mode 100644
index 0417608c18d5..000000000000
--- a/fs/bcachefs/dirent.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DIRENT_H
-#define _BCACHEFS_DIRENT_H
-
-#include "str_hash.h"
-
-extern const struct bch_hash_desc bch2_dirent_hash_desc;
-
-int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
- .key_validate = bch2_dirent_validate, \
- .val_to_text = bch2_dirent_to_text, \
- .min_val_size = 16, \
-})
-
-struct qstr;
-struct file;
-struct dir_context;
-struct bch_fs;
-struct bch_hash_info;
-struct bch_inode_info;
-
-#ifdef CONFIG_UNICODE
-int bch2_casefold(struct btree_trans *, const struct bch_hash_info *,
- const struct qstr *, struct qstr *);
-#else
-static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
-{
- return -EOPNOTSUPP;
-}
-#endif
-
-static inline int bch2_maybe_casefold(struct btree_trans *trans,
- const struct bch_hash_info *info,
- const struct qstr *str, struct qstr *out_cf)
-{
- if (likely(!info->cf_encoding)) {
- *out_cf = *str;
- return 0;
- } else {
- return bch2_casefold(trans, info, str, out_cf);
- }
-}
-
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent);
-
-static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
-{
- unsigned bytes = cf_len
- ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len
- : offsetof(struct bch_dirent, d_name) + len;
-
- return DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
- struct bkey_s_c_dirent, subvol_inum *);
-
-static inline void dirent_copy_target(struct bkey_i_dirent *dst,
- struct bkey_s_c_dirent src)
-{
- dst->v.d_inum = src.v->d_inum;
- dst->v.d_type = src.v->d_type;
-}
-
-int bch2_dirent_init_name(struct bch_fs *,
- struct bkey_i_dirent *,
- const struct bch_hash_info *,
- const struct qstr *,
- const struct qstr *);
-struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *,
- const struct bch_hash_info *, subvol_inum, u8,
- const struct qstr *, const struct qstr *, u64);
-
-int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
- const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *,
- enum btree_iter_update_trigger_flags);
-int bch2_dirent_create(struct btree_trans *, subvol_inum,
- const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *,
- enum btree_iter_update_trigger_flags);
-
-static inline unsigned vfs_d_type(unsigned type)
-{
- return type == DT_SUBVOL ? DT_DIR : type;
-}
-
-enum bch_rename_mode {
- BCH_RENAME,
- BCH_RENAME_OVERWRITE,
- BCH_RENAME_EXCHANGE,
-};
-
-int bch2_dirent_rename(struct btree_trans *,
- subvol_inum, struct bch_hash_info *,
- subvol_inum, struct bch_hash_info *,
- const struct qstr *, subvol_inum *, u64 *,
- const struct qstr *, subvol_inum *, u64 *,
- enum bch_rename_mode);
-
-int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
- subvol_inum, const struct bch_hash_info *,
- const struct qstr *, subvol_inum *, unsigned);
-u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
- const struct bch_hash_info *,
- const struct qstr *, subvol_inum *);
-
-int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
-int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
-int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *);
-
-int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
-
-#endif /* _BCACHEFS_DIRENT_H */
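
dirent_val_u64s() above sizes a dirent value as a fixed header plus the name (plus the casefolded copy, when present), rounded up to whole u64s. A quick userspace check of that rounding; the 8- and 13-byte header sizes below are stand-ins, since the real values come from offsetof() on struct bch_dirent:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Illustrative only: header sizes are assumed, not taken from the real struct. */
static unsigned example_dirent_val_u64s(unsigned len, unsigned cf_len)
{
	unsigned bytes = cf_len
		? 13 + len + cf_len	/* casefold block header + both names */
		: 8 + len;		/* plain d_name header + name */

	return DIV_ROUND_UP(bytes, sizeof(unsigned long long));
}

int main(void)
{
	printf("\"hello\"            -> %u u64s\n", example_dirent_val_u64s(5, 0));
	printf("\"Hello\" + casefold -> %u u64s\n", example_dirent_val_u64s(5, 5));
	return 0;
}
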
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
deleted file mode 100644
index a46dbddd21aa..000000000000
--- a/fs/bcachefs/dirent_format.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DIRENT_FORMAT_H
-#define _BCACHEFS_DIRENT_FORMAT_H
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-struct bch_dirent {
- struct bch_val v;
-
- /* Target inode number: */
- union {
- __le64 d_inum;
- struct { /* DT_SUBVOL */
- __le32 d_child_subvol;
- __le32 d_parent_subvol;
- };
- };
-
- /*
- * Copy of mode bits 12-15 from the target inode - so userspace can get
- * the filetype without having to do a stat()
- */
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u8 d_type:5,
- d_unused:2,
- d_casefold:1;
-#elif defined(__BIG_ENDIAN_BITFIELD)
- __u8 d_casefold:1,
- d_unused:2,
- d_type:5;
-#endif
-
- union {
- struct {
- __u8 d_pad;
- __le16 d_name_len;
- __le16 d_cf_name_len;
- __u8 d_names[];
- } d_cf_name_block __packed;
- __DECLARE_FLEX_ARRAY(__u8, d_name);
- } __packed;
-} __packed __aligned(8);
-
-#define DT_SUBVOL 16
-#define BCH_DT_MAX 17
-
-#define BCH_NAME_MAX 512
-
-#endif /* _BCACHEFS_DIRENT_FORMAT_H */
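
As the comment above explains, dirents are keyed by a 64-bit hash of the name stored in the key's offset field, with collisions resolved by linear probing, and that offset doubles as the readdir cookie; offsets 0 and 1 are reserved, which is why bch2_dirent_hash() clamps its result to 2. A small illustration of that clamp with a stand-in string hash (FNV-1a here, not the filesystem's actual hash):

#include <stdint.h>
#include <stdio.h>

/* Placeholder 64-bit string hash; only the clamp below mirrors the real code. */
static uint64_t str_hash64(const char *s)
{
	uint64_t h = 0xcbf29ce484222325ULL;	/* FNV-1a offset basis */

	while (*s) {
		h ^= (unsigned char) *s++;
		h *= 0x100000001b3ULL;		/* FNV-1a prime */
	}
	return h;
}

static uint64_t dirent_offset(const char *name)
{
	uint64_t h = str_hash64(name);

	/* offsets 0 and 1 are reserved for "." and ".." */
	return h < 2 ? 2 : h;
}

int main(void)
{
	printf("offset for \"foo\" = %llu\n",
	       (unsigned long long) dirent_offset("foo"));
	return 0;
}
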
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
deleted file mode 100644
index f7528cd69c73..000000000000
--- a/fs/bcachefs/disk_accounting.c
+++ /dev/null
@@ -1,1074 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "btree_cache.h"
-#include "btree_journal_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "compress.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "journal_io.h"
-#include "replicas.h"
-
-/*
- * Notes on disk accounting:
- *
- * We have two parallel sets of counters to be concerned with, and both must be
- * kept in sync.
- *
- * - Persistent/on disk accounting, stored in the accounting btree and updated
- * via btree write buffer updates that treat new accounting keys as deltas to
- * apply to existing values. But reading from a write buffer btree is
- * expensive, so we also have
- *
- * - In memory accounting, where accounting is stored as an array of percpu
- * counters, indexed by an eytzinger array of disk accounting keys/bpos (which
- * are the same thing, excepting byte swabbing on big endian).
- *
- * Cheap to read, but non persistent.
- *
- * Disk accounting updates are generated by transactional triggers; these run as
- * keys enter and leave the btree, and can compare old and new versions of keys;
- * the output of these triggers are deltas to the various counters.
- *
- * Disk accounting updates are done as btree write buffer updates, where the
- * counters in the disk accounting key are deltas that will be applied to the
- * counter in the btree when the key is flushed by the write buffer (or journal
- * replay).
- *
- * To do a disk accounting update:
- * - initialize a disk_accounting_pos, to specify which counter is being updated
- * - initialize counter deltas, as an array of 1-3 s64s
- * - call bch2_disk_accounting_mod()
- *
- * This queues up the accounting update to be done at transaction commit time.
- * Underneath, it's a normal btree write buffer update.
- *
- * The transaction commit path is responsible for propagating updates to the in
- * memory counters, with bch2_accounting_mem_mod().
- *
- * The commit path also assigns every disk accounting update a unique version
- * number, based on the journal sequence number and offset within that journal
- * buffer; this is used by journal replay to determine which updates have been
- * done.
- *
- * The transaction commit path also ensures that replicas entry accounting
- * updates are properly marked in the superblock (so that we know whether we can
- * mount without data being unavailable); it will update the superblock if
- * bch2_accounting_mem_mod() tells it to.
- */
-
-static const char * const disk_accounting_type_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_DISK_ACCOUNTING_TYPES()
-#undef x
- NULL
-};
-
-static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos,
- s64 *d, unsigned nr)
-{
- struct bkey_i_accounting *acc = bkey_accounting_init(k);
-
- acc->k.p = pos;
- set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
-
- memcpy_u64s_small(acc->v.d, d, nr);
-}
-
-static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
- s64 *d, unsigned nr)
-{
- return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr);
-}
-
-static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
-
-int bch2_disk_accounting_mod(struct btree_trans *trans,
- struct disk_accounting_pos *k,
- s64 *d, unsigned nr, bool gc)
-{
- BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
-
- /* Normalize: */
- switch (k->type) {
- case BCH_DISK_ACCOUNTING_replicas:
- bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
- break;
- }
-
- struct bpos pos = disk_accounting_pos_to_bpos(k);
-
- if (likely(!gc)) {
- struct bkey_i_accounting *a;
-#if 0
- for (a = btree_trans_subbuf_base(trans, &trans->accounting);
- a != btree_trans_subbuf_top(trans, &trans->accounting);
- a = (void *) bkey_next(&a->k_i))
- if (bpos_eq(a->k.p, pos)) {
- BUG_ON(nr != bch2_accounting_counters(&a->k));
- acc_u64s(a->v.d, d, nr);
-
- if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) {
- unsigned offset = (u64 *) a -
- (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
-
- trans->accounting.u64s -= a->k.u64s;
- memmove_u64s_down(a,
- bkey_next(&a->k_i),
- trans->accounting.u64s - offset);
- }
- return 0;
- }
-#endif
- unsigned u64s = sizeof(*a) / sizeof(u64) + nr;
- a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s);
- int ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- return ret;
-
- __accounting_key_init(&a->k_i, pos, d, nr);
- return 0;
- } else {
- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
-
- __accounting_key_init(&k_i.k, pos, d, nr);
-
- int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
- if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
- ret = drop_locks_do(trans,
- bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
- bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
- return ret;
- }
-}
-
-int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
- unsigned dev, s64 sectors,
- bool gc)
-{
- struct disk_accounting_pos acc;
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_replicas_entry_cached(&acc.replicas, dev);
-
- return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
-}
-
-static inline bool is_zero(char *start, char *end)
-{
- BUG_ON(start > end);
-
- for (; start < end; start++)
- if (*start)
- return false;
- return true;
-}
-
-#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member))
-
-static const unsigned bch2_accounting_type_nr_counters[] = {
-#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr,
- BCH_DISK_ACCOUNTING_TYPES()
-#undef x
-};
-
-int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
- void *end = &acc_k + 1;
- int ret = 0;
-
- bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
- bversion_zero(k.k->bversion),
- c, accounting_key_version_0,
- "accounting key with version=0");
-
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_nr_inodes:
- end = field_end(acc_k, nr_inodes);
- break;
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- end = field_end(acc_k, persistent_reserved);
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- bkey_fsck_err_on(!acc_k.replicas.nr_devs,
- c, accounting_key_replicas_nr_devs_0,
- "accounting key replicas entry with nr_devs=0");
-
- bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs ||
- (acc_k.replicas.nr_required > 1 &&
- acc_k.replicas.nr_required == acc_k.replicas.nr_devs),
- c, accounting_key_replicas_nr_required_bad,
- "accounting key replicas entry with bad nr_required");
-
- for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++)
- bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1],
- c, accounting_key_replicas_devs_unsorted,
- "accounting key replicas entry with unsorted devs");
-
- end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- end = field_end(acc_k, dev_data_type);
- break;
- case BCH_DISK_ACCOUNTING_compression:
- end = field_end(acc_k, compression);
- break;
- case BCH_DISK_ACCOUNTING_snapshot:
- end = field_end(acc_k, snapshot);
- break;
- case BCH_DISK_ACCOUNTING_btree:
- end = field_end(acc_k, btree);
- break;
- case BCH_DISK_ACCOUNTING_rebalance_work:
- end = field_end(acc_k, rebalance_work);
- break;
- }
-
- bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
- c, accounting_key_junk_at_end,
- "junk at end of accounting key");
-
- bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
- c, accounting_key_nr_counters_wrong,
- "accounting key with %u counters, should be %u",
- bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
-fsck_err:
- return ret;
-}
-
-void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
-{
- if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
- prt_printf(out, "unknown type %u", k->type);
- return;
- }
-
- prt_str(out, disk_accounting_type_strs[k->type]);
- prt_str(out, " ");
-
- switch (k->type) {
- case BCH_DISK_ACCOUNTING_nr_inodes:
- break;
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- bch2_replicas_entry_to_text(out, &k->replicas);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
- bch2_prt_data_type(out, k->dev_data_type.data_type);
- break;
- case BCH_DISK_ACCOUNTING_compression:
- bch2_prt_compression_type(out, k->compression.type);
- break;
- case BCH_DISK_ACCOUNTING_snapshot:
- prt_printf(out, "id=%u", k->snapshot.id);
- break;
- case BCH_DISK_ACCOUNTING_btree:
- prt_str(out, "btree=");
- bch2_btree_id_to_text(out, k->btree.id);
- break;
- }
-}
-
-void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- bch2_accounting_key_to_text(out, &acc_k);
-
- for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
- prt_printf(out, " %lli", acc.v->d[i]);
-}
-
-void bch2_accounting_swab(struct bkey_s k)
-{
- for (u64 *p = (u64 *) k.v;
- p < (u64 *) bkey_val_end(k);
- p++)
- *p = swab64(*p);
-}
-
-static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
- struct disk_accounting_pos *acc)
-{
- unsafe_memcpy(r, &acc->replicas,
- replicas_entry_bytes(&acc->replicas),
- "variable length struct");
-}
-
-static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
-{
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, p);
-
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_replicas:
- __accounting_to_replicas(r, &acc_k);
- return true;
- default:
- return false;
- }
-}
-
-static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
-{
- union bch_replicas_padded r;
- return accounting_to_replicas(&r.e, p)
- ? bch2_mark_replicas(c, &r.e)
- : 0;
-}
-
-/*
- * Ensure accounting keys being updated are present in the superblock, when
- * applicable (i.e. replicas updates)
- */
-int bch2_accounting_update_sb(struct btree_trans *trans)
-{
- for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
- i != btree_trans_subbuf_top(trans, &trans->accounting);
- i = bkey_next(i)) {
- int ret = bch2_accounting_update_sb_one(trans->c, i->k.p);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- /* raced with another insert, already present: */
- if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &a.k->p) < acc->k.nr)
- return 0;
-
- struct accounting_mem_entry n = {
- .pos = a.k->p,
- .bversion = a.k->bversion,
- .nr_counters = bch2_accounting_counters(a.k),
- .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
- sizeof(u64), GFP_KERNEL),
- };
-
- if (!n.v[0])
- goto err;
-
- if (acc->gc_running) {
- n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
- sizeof(u64), GFP_KERNEL);
- if (!n.v[1])
- goto err;
- }
-
- if (darray_push(&acc->k, n))
- goto err;
-
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
-
- if (trace_accounting_mem_insert_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_accounting_to_text(&buf, c, a.s_c);
- trace_accounting_mem_insert(c, buf.buf);
- printbuf_exit(&buf);
- }
- return 0;
-err:
- free_percpu(n.v[1]);
- free_percpu(n.v[0]);
- return bch_err_throw(c, ENOMEM_disk_accounting);
-}
-
-int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
- enum bch_accounting_mode mode)
-{
- union bch_replicas_padded r;
-
- if (mode != BCH_ACCOUNTING_read &&
- accounting_to_replicas(&r.e, a.k->p) &&
- !bch2_replicas_marked_locked(c, &r.e))
- return bch_err_throw(c, btree_insert_need_mark_replicas);
-
- percpu_up_read(&c->mark_lock);
- percpu_down_write(&c->mark_lock);
- int ret = __bch2_accounting_mem_insert(c, a);
- percpu_up_write(&c->mark_lock);
- percpu_down_read(&c->mark_lock);
- return ret;
-}
-
-int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
- enum bch_accounting_mode mode)
-{
- union bch_replicas_padded r;
-
- if (mode != BCH_ACCOUNTING_read &&
- accounting_to_replicas(&r.e, a.k->p) &&
- !bch2_replicas_marked_locked(c, &r.e))
- return bch_err_throw(c, btree_insert_need_mark_replicas);
-
- return __bch2_accounting_mem_insert(c, a);
-}
-
-static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
-{
- for (unsigned i = 0; i < e->nr_counters; i++)
- if (percpu_u64_get(e->v[0] + i) ||
- (e->v[1] &&
- percpu_u64_get(e->v[1] + i)))
- return false;
- return true;
-}
-
-void bch2_accounting_mem_gc(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- percpu_down_write(&c->mark_lock);
- struct accounting_mem_entry *dst = acc->k.data;
-
- darray_for_each(acc->k, src) {
- if (accounting_mem_entry_is_zero(src)) {
- free_percpu(src->v[0]);
- free_percpu(src->v[1]);
- } else {
- *dst++ = *src;
- }
- }
-
- acc->k.nr = dst - acc->k.data;
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
- percpu_up_write(&c->mark_lock);
-}
-
-/*
- * Read out accounting keys for replicas entries, as an array of
- * bch_replicas_usage entries.
- *
- * Note: this may be deprecated/removed at some point in the future and replaced
- * with something more general, it exists to support the ioctl used by the
- * 'bcachefs fs usage' command.
- */
-int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- int ret = 0;
-
- darray_init(usage);
-
- percpu_down_read(&c->mark_lock);
- darray_for_each(acc->k, i) {
- union {
- u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs,
- BCH_BKEY_PTRS_MAX)];
- struct bch_replicas_usage r;
- } u;
- u.r.r.nr_devs = BCH_BKEY_PTRS_MAX;
-
- if (!accounting_to_replicas(&u.r.r, i->pos))
- continue;
-
- u64 sectors;
- bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
- u.r.sectors = sectors;
-
- ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
- if (ret)
- break;
-
- memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
- usage->nr += replicas_usage_bytes(&u.r);
- }
- percpu_up_read(&c->mark_lock);
-
- if (ret)
- darray_exit(usage);
- return ret;
-}
-
-int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
-{
-
- struct bch_accounting_mem *acc = &c->accounting;
- int ret = 0;
-
- darray_init(out_buf);
-
- percpu_down_read(&c->mark_lock);
- darray_for_each(acc->k, i) {
- struct disk_accounting_pos a_p;
- bpos_to_disk_accounting_pos(&a_p, i->pos);
-
- if (!(accounting_types_mask & BIT(a_p.type)))
- continue;
-
- ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
- sizeof(u64) * i->nr_counters);
- if (ret)
- break;
-
- struct bkey_i_accounting *a_out =
- bkey_accounting_init((void *) &darray_top(*out_buf));
- set_bkey_val_u64s(&a_out->k, i->nr_counters);
- a_out->k.p = i->pos;
- bch2_accounting_mem_read_counters(acc, i - acc->k.data,
- a_out->v.d, i->nr_counters, false);
-
- if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
- out_buf->nr += bkey_bytes(&a_out->k);
- }
-
- percpu_up_read(&c->mark_lock);
-
- if (ret)
- darray_exit(out_buf);
- return ret;
-}
-
-static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
-{
- darray_for_each(acc->k, e) {
- free_percpu(e->v[gc]);
- e->v[gc] = NULL;
- }
-}
-
-int bch2_gc_accounting_start(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- int ret = 0;
-
- percpu_down_write(&c->mark_lock);
- darray_for_each(acc->k, e) {
- e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64),
- sizeof(u64), GFP_KERNEL);
- if (!e->v[1]) {
- bch2_accounting_free_counters(acc, true);
- ret = bch_err_throw(c, ENOMEM_disk_accounting);
- break;
- }
- }
-
- acc->gc_running = !ret;
- percpu_up_write(&c->mark_lock);
-
- return ret;
-}
-
-int bch2_gc_accounting_done(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
- struct bpos pos = POS_MIN;
- int ret = 0;
-
- percpu_down_write(&c->mark_lock);
- while (1) {
- unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &pos);
-
- if (idx >= acc->k.nr)
- break;
-
- struct accounting_mem_entry *e = acc->k.data + idx;
- pos = bpos_successor(e->pos);
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, e->pos);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- continue;
-
- u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
- u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];
-
- unsigned nr = e->nr_counters;
- bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false);
- bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true);
-
- if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
- printbuf_reset(&buf);
- prt_str(&buf, "accounting mismatch for ");
- bch2_accounting_key_to_text(&buf, &acc_k);
-
- prt_str(&buf, ":\n got");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", dst_v[j]);
-
- prt_str(&buf, "\nshould be");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", src_v[j]);
-
- for (unsigned j = 0; j < nr; j++)
- src_v[j] -= dst_v[j];
-
- bch2_trans_unlock_long(trans);
-
- if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) {
- percpu_up_write(&c->mark_lock);
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
- percpu_down_write(&c->mark_lock);
- if (ret)
- goto err;
-
- if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
- memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
-
- accounting_key_init(&k_i.k, &acc_k, src_v, nr);
- bch2_accounting_mem_mod_locked(trans,
- bkey_i_to_s_c_accounting(&k_i.k),
- BCH_ACCOUNTING_normal, true);
-
- preempt_disable();
- struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
- struct bch_fs_usage_base *src = &trans->fs_usage_delta;
- acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
- preempt_enable();
- }
- }
- }
- }
-err:
-fsck_err:
- percpu_up_write(&c->mark_lock);
- printbuf_exit(&buf);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
-
- if (k.k->type != KEY_TYPE_accounting)
- return 0;
-
- percpu_down_read(&c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
- BCH_ACCOUNTING_read, false);
- percpu_up_read(&c->mark_lock);
- return ret;
-}
-
-static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
- struct disk_accounting_pos *acc,
- u64 *v, unsigned nr)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0, invalid_dev = -1;
-
- switch (acc->type) {
- case BCH_DISK_ACCOUNTING_replicas: {
- union bch_replicas_padded r;
- __accounting_to_replicas(&r.e, acc);
-
- for (unsigned i = 0; i < r.e.nr_devs; i++)
- if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
- !bch2_dev_exists(c, r.e.devs[i])) {
- invalid_dev = r.e.devs[i];
- goto invalid_device;
- }
-
- /*
- * All replicas entry checks except for invalid device are done
- * in bch2_accounting_validate
- */
- BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
-
- if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
- trans, accounting_replicas_not_marked,
- "accounting not marked in superblock replicas\n%s",
- (printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, acc),
- buf.buf))) {
- /*
- * We're not RW yet and still single threaded; dropping
- * and retaking the lock is ok:
- */
- percpu_up_write(&c->mark_lock);
- ret = bch2_mark_replicas(c, &r.e);
- if (ret)
- goto fsck_err;
- percpu_down_write(&c->mark_lock);
- }
- break;
- }
-
- case BCH_DISK_ACCOUNTING_dev_data_type:
- if (!bch2_dev_exists(c, acc->dev_data_type.dev)) {
- invalid_dev = acc->dev_data_type.dev;
- goto invalid_device;
- }
- break;
- }
-
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-invalid_device:
- if (fsck_err(trans, accounting_to_invalid_device,
- "accounting entry points to invalid device %i\n%s",
- invalid_dev,
- (printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, acc),
- buf.buf))) {
- for (unsigned i = 0; i < nr; i++)
- v[i] = -v[i];
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?:
- -BCH_ERR_remove_disk_accounting_entry;
- } else {
- ret = bch_err_throw(c, remove_disk_accounting_entry);
- }
- goto fsck_err;
-}
-
-/*
- * At startup time, initialize the in memory accounting from the btree (and
- * journal)
- */
-int bch2_accounting_read(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
-
- /*
- * We might run more than once if we rewind to start topology repair or
- * btree node scan - and those might cause us to get different results,
- * so we can't just skip if we've already run.
- *
- * Instead, zero out any accounting we have:
- */
- percpu_down_write(&c->mark_lock);
- darray_for_each(acc->k, e)
- percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
- for_each_member_device(c, ca)
- percpu_memset(ca->usage, 0, sizeof(*ca->usage));
- percpu_memset(c->usage, 0, sizeof(*c->usage));
- percpu_up_write(&c->mark_lock);
-
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
- iter.flags &= ~BTREE_ITER_with_journal;
- int ret = for_each_btree_key_continue(trans, iter,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
-
- if (k.k->type != KEY_TYPE_accounting)
- continue;
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
-
- if (!bch2_accounting_is_mem(&acc_k)) {
- struct disk_accounting_pos next;
- memset(&next, 0, sizeof(next));
- next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
- continue;
- }
-
- accounting_read_key(trans, k);
- }));
- if (ret)
- goto err;
-
- struct journal_keys *keys = &c->journal_keys;
- struct journal_key *dst = keys->data;
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i) {
- if (i->k->k.type == KEY_TYPE_accounting) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
-
- if (!bch2_accounting_is_mem(&acc_k))
- continue;
-
- struct bkey_s_c k = bkey_i_to_s_c(i->k);
- unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
- sizeof(acc->k.data[0]),
- accounting_pos_cmp, &k.k->p);
-
- bool applied = idx < acc->k.nr &&
- bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
-
- if (applied)
- continue;
-
- if (i + 1 < &darray_top(*keys) &&
- i[1].k->k.type == KEY_TYPE_accounting &&
- !journal_key_cmp(i, i + 1)) {
- WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
-
- i[1].journal_seq = i[0].journal_seq;
-
- bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
- bkey_s_c_to_accounting(k));
- continue;
- }
-
- ret = accounting_read_key(trans, k);
- if (ret)
- goto err;
- }
-
- *dst++ = *i;
- }
- keys->gap = keys->nr = dst - keys->data;
-
- percpu_down_write(&c->mark_lock);
-
- darray_for_each_reverse(acc->k, i) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, i->pos);
-
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- memset(v, 0, sizeof(v));
-
- for (unsigned j = 0; j < i->nr_counters; j++)
- v[j] = percpu_u64_get(i->v[0] + j);
-
- /*
- * If the entry counters are zeroed, it should be treated as
- * nonexistent - it might point to an invalid device.
- *
- * Remove it, so that if it's re-added it gets re-marked in the
- * superblock:
- */
- ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
- ? -BCH_ERR_remove_disk_accounting_entry
- : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
-
- if (ret == -BCH_ERR_remove_disk_accounting_entry) {
- free_percpu(i->v[0]);
- free_percpu(i->v[1]);
- darray_remove_item(&acc->k, i);
- ret = 0;
- continue;
- }
-
- if (ret)
- goto fsck_err;
- }
-
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
-
- preempt_disable();
- struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
-
- for (unsigned i = 0; i < acc->k.nr; i++) {
- struct disk_accounting_pos k;
- bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
-
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
-
- switch (k.type) {
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type: {
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
- if (ca) {
- struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
- percpu_u64_set(&d->buckets, v[0]);
- percpu_u64_set(&d->sectors, v[1]);
- percpu_u64_set(&d->fragmented, v[2]);
-
- if (k.dev_data_type.data_type == BCH_DATA_sb ||
- k.dev_data_type.data_type == BCH_DATA_journal)
- usage->hidden += v[0] * ca->mi.bucket_size;
- }
- break;
- }
- }
- }
- preempt_enable();
-fsck_err:
- percpu_up_write(&c->mark_lock);
-err:
- printbuf_exit(&buf);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
-{
- return bch2_trans_run(c,
- bch2_btree_write_buffer_flush_sync(trans) ?:
- for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
- struct disk_accounting_pos acc;
- bpos_to_disk_accounting_pos(&acc, k.k->p);
-
- acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
- acc.dev_data_type.dev == dev
- ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
- : 0;
- })) ?:
- bch2_btree_write_buffer_flush_sync(trans));
-}
-
-int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
-{
- struct bch_fs *c = ca->fs;
- u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
-
- int ret = bch2_trans_do(c, ({
- bch2_disk_accounting_mod2(trans, gc,
- v, dev_data_type,
- .dev = ca->dev_idx,
- .data_type = BCH_DATA_free) ?:
- (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
- }));
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_verify_accounting_clean(struct bch_fs *c)
-{
- bool mismatch = false;
- struct bch_fs_usage_base base = {}, base_inmem = {};
-
- bch2_trans_run(c,
- for_each_btree_key(trans, iter,
- BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_all_snapshots, k, ({
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
- unsigned nr = bch2_accounting_counters(k.k);
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
-
- if (!bch2_accounting_is_mem(&acc_k)) {
- struct disk_accounting_pos next;
- memset(&next, 0, sizeof(next));
- next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
- continue;
- }
-
- bch2_accounting_mem_read(c, k.k->p, v, nr);
-
- if (memcmp(a.v->d, v, nr * sizeof(u64))) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, " !=");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", v[j]);
-
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
- mismatch = true;
- }
-
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- {
- guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
- if (!ca)
- continue;
-
- v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
- v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
- v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
- }
-
- if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, " in mem");
- for (unsigned j = 0; j < nr; j++)
- prt_printf(&buf, " %llu", v[j]);
-
- pr_err("dev accounting mismatch: %s", buf.buf);
- printbuf_exit(&buf);
- mismatch = true;
- }
- }
-
- 0;
- })));
-
- acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64));
-
-#define check(x) \
- if (base.x != base_inmem.x) { \
- pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \
- mismatch = true; \
- }
-
- //check(hidden);
- check(btree);
- check(data);
- check(cached);
- check(reserved);
- check(nr_inodes);
-
- WARN_ON(mismatch);
-}
-
-void bch2_accounting_gc_free(struct bch_fs *c)
-{
- lockdep_assert_held(&c->mark_lock);
-
- struct bch_accounting_mem *acc = &c->accounting;
-
- bch2_accounting_free_counters(acc, true);
- acc->gc_running = false;
-}
-
-void bch2_fs_accounting_exit(struct bch_fs *c)
-{
- struct bch_accounting_mem *acc = &c->accounting;
-
- bch2_accounting_free_counters(acc, false);
- darray_exit(&acc->k);
-}
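
The header comment in the removed disk_accounting.c spells out the update flow: fill in a disk_accounting_pos to name the counter, build an array of 1-3 s64 deltas, and call bch2_disk_accounting_mod(). A sketch of a caller in that style, modeled on bch2_mod_dev_cached_sectors() above; it assumes the removed bcachefs headers and that BCH_DISK_ACCOUNTING_nr_inodes is a single-counter type:

/* Sketch only: mirrors the three-step flow described in the removed file. */
static int example_mod_nr_inodes(struct btree_trans *trans, s64 delta, bool gc)
{
	struct disk_accounting_pos acc;
	s64 d[1] = { delta };

	/* 1) name the counter being updated */
	memset(&acc, 0, sizeof(acc));
	acc.type = BCH_DISK_ACCOUNTING_nr_inodes;

	/* 2) + 3) hand the delta array to the accounting machinery */
	return bch2_disk_accounting_mod(trans, &acc, d, ARRAY_SIZE(d), gc);
}
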
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
deleted file mode 100644
index d61abebf3e0b..000000000000
--- a/fs/bcachefs/disk_accounting.h
+++ /dev/null
@@ -1,301 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_H
-#define _BCACHEFS_DISK_ACCOUNTING_H
-
-#include "btree_update.h"
-#include "eytzinger.h"
-#include "sb-members.h"
-
-static inline void bch2_u64s_neg(u64 *v, unsigned nr)
-{
- for (unsigned i = 0; i < nr; i++)
- v[i] = -v[i];
-}
-
-static inline unsigned bch2_accounting_counters(const struct bkey *k)
-{
- return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64);
-}
-
-static inline void bch2_accounting_neg(struct bkey_s_accounting a)
-{
- bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k));
-}
-
-static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
-{
- for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
- if (a.v->d[i])
- return false;
- return true;
-}
-
-static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
- struct bkey_s_c_accounting src)
-{
- for (unsigned i = 0;
- i < min(bch2_accounting_counters(&dst->k),
- bch2_accounting_counters(src.k));
- i++)
- dst->v.d[i] += src.v->d[i];
-
- if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
- dst->k.bversion = src.k->bversion;
-}
-
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
- enum bch_data_type data_type,
- s64 sectors)
-{
- switch (data_type) {
- case BCH_DATA_btree:
- fs_usage->btree += sectors;
- break;
- case BCH_DATA_user:
- case BCH_DATA_parity:
- fs_usage->data += sectors;
- break;
- case BCH_DATA_cached:
- fs_usage->cached += sectors;
- break;
- default:
- break;
- }
-}
-
-static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
-{
- BUILD_BUG_ON(sizeof(*acc) != sizeof(p));
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- acc->_pad = p;
-#else
- memcpy_swab(acc, &p, sizeof(p));
-#endif
-}
-
-static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc)
-{
- struct bpos p;
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- p = acc->_pad;
-#else
- memcpy_swab(&p, acc, sizeof(p));
-#endif
- return p;
-}
-
-int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
- s64 *, unsigned, bool);
-
-#define disk_accounting_key_init(_k, _type, ...) \
-do { \
- memset(&(_k), 0, sizeof(_k)); \
- (_k).type = BCH_DISK_ACCOUNTING_##_type; \
- (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \
-} while (0)
-
-#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \
-({ \
- struct disk_accounting_pos pos; \
- disk_accounting_key_init(pos, __VA_ARGS__); \
-	bch2_disk_accounting_mod(_trans, &pos, _v, _nr, _gc);			\
-})
-
-#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \
- bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__)
-
-int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
-
-int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
-void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_accounting_swab(struct bkey_s);
-
-#define bch2_bkey_ops_accounting ((struct bkey_ops) { \
- .key_validate = bch2_accounting_validate, \
- .val_to_text = bch2_accounting_to_text, \
- .swab = bch2_accounting_swab, \
- .min_val_size = 8, \
-})
-
-int bch2_accounting_update_sb(struct btree_trans *);
-
-static inline int accounting_pos_cmp(const void *_l, const void *_r)
-{
- const struct bpos *l = _l, *r = _r;
-
- return bpos_cmp(*l, *r);
-}
-
-enum bch_accounting_mode {
- BCH_ACCOUNTING_normal,
- BCH_ACCOUNTING_gc,
- BCH_ACCOUNTING_read,
-};
-
-int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
-int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
-void bch2_accounting_mem_gc(struct bch_fs *);
-
-static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc)
-{
- return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR &&
- acc->type != BCH_DISK_ACCOUNTING_inum;
-}
-
-/*
- * Update in memory counters so they match the btree update we're doing; called
- * from transaction commit path
- */
-static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
- struct bkey_s_c_accounting a,
- enum bch_accounting_mode mode,
- bool write_locked)
-{
- struct bch_fs *c = trans->c;
- struct bch_accounting_mem *acc = &c->accounting;
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, a.k->p);
- bool gc = mode == BCH_ACCOUNTING_gc;
-
- if (gc && !acc->gc_running)
- return 0;
-
- if (!bch2_accounting_is_mem(&acc_k))
- return 0;
-
- if (mode == BCH_ACCOUNTING_normal) {
- switch (acc_k.type) {
- case BCH_DISK_ACCOUNTING_persistent_reserved:
- trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
- break;
- case BCH_DISK_ACCOUNTING_replicas:
- fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
- break;
- case BCH_DISK_ACCOUNTING_dev_data_type: {
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
- if (ca) {
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
- }
- break;
- }
- }
- }
-
- unsigned idx;
-
- while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
- int ret = 0;
- if (unlikely(write_locked))
- ret = bch2_accounting_mem_insert_locked(c, a, mode);
- else
- ret = bch2_accounting_mem_insert(c, a, mode);
- if (ret)
- return ret;
- }
-
- struct accounting_mem_entry *e = &acc->k.data[idx];
-
- EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);
-
- for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
- this_cpu_add(e->v[gc][i], a.v->d[i]);
- return 0;
-}
-
-static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
-{
- percpu_down_read(&trans->c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
- percpu_up_read(&trans->c->mark_lock);
- return ret;
-}
-
-static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc,
- unsigned idx, u64 *v, unsigned nr, bool gc)
-{
- memset(v, 0, sizeof(*v) * nr);
-
- if (unlikely(idx >= acc->k.nr))
- return;
-
- struct accounting_mem_entry *e = &acc->k.data[idx];
-
- nr = min_t(unsigned, nr, e->nr_counters);
-
- for (unsigned i = 0; i < nr; i++)
- v[i] = percpu_u64_get(e->v[gc] + i);
-}
-
-static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
- u64 *v, unsigned nr)
-{
- percpu_down_read(&c->mark_lock);
- struct bch_accounting_mem *acc = &c->accounting;
- unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &p);
-
- bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
- percpu_up_read(&c->mark_lock);
-}
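A usage sketch (the wrapper below is hypothetical and only illustrates the helpers defined in this header): reading one in-memory counter means building a disk_accounting_pos, converting it to a bpos, and pulling the value out of the percpu mirror:

static u64 read_nr_inodes_example(struct bch_fs *c)
{
	struct disk_accounting_pos acc;
	u64 v[1];

	/* nr_inodes has a single counter (see BCH_DISK_ACCOUNTING_TYPES) */
	disk_accounting_key_init(acc, nr_inodes);
	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), v, ARRAY_SIZE(v));
	return v[0];
}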
-
-static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
-{
- EBUG_ON(!res->ref);
-
- return (struct bversion) {
- .hi = res->seq >> 32,
- .lo = (res->seq << 32) | (res->offset + offset),
- };
-}
-
-static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
- struct bkey_i_accounting *a,
- unsigned commit_flags)
-{
- u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
- a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base);
-
- EBUG_ON(bversion_zero(a->k.bversion));
-
- return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
- ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
- : 0;
-}
-
-static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans,
- struct bkey_i_accounting *a_i,
- unsigned commit_flags)
-{
- if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
- struct bkey_s_accounting a = accounting_i_to_s(a_i);
-
- bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
- bch2_accounting_neg(a);
- }
-}
-
-int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
-int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
-
-int bch2_gc_accounting_start(struct bch_fs *);
-int bch2_gc_accounting_done(struct bch_fs *);
-
-int bch2_accounting_read(struct bch_fs *);
-
-int bch2_dev_usage_remove(struct bch_fs *, unsigned);
-int bch2_dev_usage_init(struct bch_dev *, bool);
-
-void bch2_verify_accounting_clean(struct bch_fs *c);
-
-void bch2_accounting_gc_free(struct bch_fs *);
-void bch2_fs_accounting_exit(struct bch_fs *);
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_H */
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
deleted file mode 100644
index 8269af1dbe2a..000000000000
--- a/fs/bcachefs/disk_accounting_format.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
-#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
-
-#include "replicas_format.h"
-
-/*
- * Disk accounting - KEY_TYPE_accounting - on disk format:
- *
- * Here, the key has considerably more structure than a typical key (bpos); an
- * accounting key is 'struct disk_accounting_pos', which is a union with bpos.
- *
- * More specifically: a key is just a multiword integer (where word endianness
- * matches native byte order), so we're treating bpos as an opaque 20 byte
- * integer and mapping struct disk_accounting_pos onto it.
- *
- * This is a type-tagged union of all our various subtypes; a disk accounting
- * key can be device counters, replicas counters, et cetera - it's extensible.
- *
- * The value is a list of u64s or s64s; the number of counters is specific to a
- * given accounting type.
- *
- * Unlike with other key types, updates are _deltas_, and the deltas are not
- * resolved until the update to the underlying btree, done by btree write buffer
- * flush or journal replay.
- *
- * Journal replay in particular requires special handling. The journal tracks a
- * range of entries which may not yet have been applied to the btree - it does
- * not know definitively whether individual entries are dirty and still need to
- * be applied.
- *
- * To handle this, we use the version field of struct bkey, and give every
- * accounting update a unique version number - a total ordering in time; the
- * version number is derived from the key's position in the journal. Then
- * journal replay can compare the version number of the key from the journal
- * with the version number of the key in the btree to determine if a key needs
- * to be replayed.
- *
- * For this to work, we must maintain this strict time ordering of updates as
- * they are flushed to the btree, both via write buffer flush and via journal
- * replay. This has complications for the write buffer code while journal replay
- * is still in progress; the write buffer cannot flush any accounting keys to
- * the btree until journal replay has finished replaying its accounting keys, or
- * the (newer) version number of the keys from the write buffer will cause
- * updates from journal replay to be lost.
- */
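To make the version-ordering rule above concrete, a minimal sketch (the helper name is hypothetical; bversion_cmp and the bversion field are what the accounting code already uses): a journaled accounting delta still needs to be replayed only if its version, derived from its journal position, is newer than the version of the key already in the btree:

static inline bool accounting_key_needs_replay(struct bkey_s_c_accounting journal_k,
					       struct bkey_s_c_accounting btree_k)
{
	/* versions are derived from journal position, so this is a total order in time */
	return bversion_cmp(journal_k.k->bversion, btree_k.k->bversion) > 0;
}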
-
-struct bch_accounting {
- struct bch_val v;
- __u64 d[];
-};
-
-#define BCH_ACCOUNTING_MAX_COUNTERS 3
-
-#define BCH_DATA_TYPES() \
- x(free, 0) \
- x(sb, 1) \
- x(journal, 2) \
- x(btree, 3) \
- x(user, 4) \
- x(cached, 5) \
- x(parity, 6) \
- x(stripe, 7) \
- x(need_gc_gens, 8) \
- x(need_discard, 9) \
- x(unstriped, 10)
-
-enum bch_data_type {
-#define x(t, n) BCH_DATA_##t,
- BCH_DATA_TYPES()
-#undef x
- BCH_DATA_NR
-};
-
-static inline bool data_type_is_empty(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_free:
- case BCH_DATA_need_gc_gens:
- case BCH_DATA_need_discard:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool data_type_is_hidden(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_sb:
- case BCH_DATA_journal:
- return true;
- default:
- return false;
- }
-}
-
-/*
- * field 1: name
- * field 2: id
- * field 3: number of counters (max 3)
- */
-
-#define BCH_DISK_ACCOUNTING_TYPES() \
- x(nr_inodes, 0, 1) \
- x(persistent_reserved, 1, 1) \
- x(replicas, 2, 1) \
- x(dev_data_type, 3, 3) \
- x(compression, 4, 3) \
- x(snapshot, 5, 1) \
- x(btree, 6, 1) \
- x(rebalance_work, 7, 1) \
- x(inum, 8, 3)
-
-enum disk_accounting_type {
-#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
- BCH_DISK_ACCOUNTING_TYPES()
-#undef x
- BCH_DISK_ACCOUNTING_TYPE_NR,
-};
-
-/*
- * No subtypes - number of inodes in the entire filesystem
- *
- * XXX: perhaps we could add a per-subvolume counter?
- */
-struct bch_acct_nr_inodes {
-};
-
-/*
- * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the
- * reservation:
- */
-struct bch_acct_persistent_reserved {
- __u8 nr_replicas;
-};
-
-/*
- * device, data type counter fields:
- * [
- * nr_buckets
- * live sectors (in buckets of that data type)
- * sectors of internal fragmentation
- * ]
- *
- * XXX: live sectors should've been done differently: a bucket can hold multiple
- * data types at once (user, stripe, cached), but this collapses them all to the
- * bucket's data type and makes the internal fragmentation counter redundant
- */
-struct bch_acct_dev_data_type {
- __u8 dev;
- __u8 data_type;
-};
-
-/*
- * Compression type fields:
- * [
- * number of extents
- * uncompressed size
- * compressed size
- * ]
- *
- * Compression ratio, average extent size (fragmentation).
- */
-struct bch_acct_compression {
- __u8 type;
-};
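A hedged sketch of how these three counters might be consumed after being read out (the struct and helper are illustrative only; sizes are assumed to be in sectors, matching the rest of the accounting code):

struct compression_counters {
	u64 nr_extents;
	u64 sectors_uncompressed;
	u64 sectors_compressed;
};

/* compression ratio as a percentage, e.g. 400 for 4:1; 0 if nothing compressed */
static inline u64 compression_ratio_pct(const struct compression_counters *v)
{
	return v->sectors_compressed
		? div64_u64(v->sectors_uncompressed * 100, v->sectors_compressed)
		: 0;
}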
-
-/*
- * On disk usage by snapshot id; counts the same values as the replicas counter,
- * but aggregated differently
- */
-struct bch_acct_snapshot {
- __u32 id;
-} __packed;
-
-struct bch_acct_btree {
- __u32 id;
-} __packed;
-
-/*
- * inum counter fields:
- * [
- * number of extents
- * sum of extent sizes - bkey size
- * this field is similar to inode.bi_sectors, except here extents in
- * different snapshots but the same inode number are all collapsed to the
- * same counter
- * sum of on disk size - same values tracked by replicas counters
- * ]
- *
- * This tracks on disk fragmentation.
- */
-struct bch_acct_inum {
- __u64 inum;
-} __packed;
-
-/*
- * Simple counter of the amount of data (on disk sectors) that rebalance needs
- * to move; extents counted here are also in the rebalance_work btree.
- */
-struct bch_acct_rebalance_work {
-};
-
-struct disk_accounting_pos {
- union {
- struct {
- __u8 type;
- union {
- struct bch_acct_nr_inodes nr_inodes;
- struct bch_acct_persistent_reserved persistent_reserved;
- struct bch_replicas_entry_v1 replicas;
- struct bch_acct_dev_data_type dev_data_type;
- struct bch_acct_compression compression;
- struct bch_acct_snapshot snapshot;
- struct bch_acct_btree btree;
- struct bch_acct_rebalance_work rebalance_work;
- struct bch_acct_inum inum;
- } __packed;
- } __packed;
- struct bpos _pad;
- };
-};
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h
deleted file mode 100644
index b1982131b206..000000000000
--- a/fs/bcachefs/disk_accounting_types.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H
-#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H
-
-#include "darray.h"
-
-struct accounting_mem_entry {
- struct bpos pos;
- struct bversion bversion;
- unsigned nr_counters;
- u64 __percpu *v[2];
-};
-
-struct bch_accounting_mem {
- DARRAY(struct accounting_mem_entry) k;
- bool gc_running;
-};
-
-#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
deleted file mode 100644
index cde842ac1886..000000000000
--- a/fs/bcachefs/disk_groups.c
+++ /dev/null
@@ -1,591 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "disk_groups.h"
-#include "sb-members.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-
-static int group_cmp(const void *_l, const void *_r)
-{
- const struct bch_disk_group *l = _l;
- const struct bch_disk_group *r = _r;
-
- return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
- (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
- ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
- (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
- strncmp(l->label, r->label, sizeof(l->label));
-}
-
-static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_disk_groups *groups =
- field_to_type(f, disk_groups);
- struct bch_disk_group *g, *sorted = NULL;
- unsigned nr_groups = disk_groups_nr(groups);
- unsigned i, len;
- int ret = 0;
-
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member m = bch2_sb_member_get(sb, i);
- unsigned group_id;
-
- if (!BCH_MEMBER_GROUP(&m))
- continue;
-
- group_id = BCH_MEMBER_GROUP(&m) - 1;
-
- if (group_id >= nr_groups) {
- prt_printf(err, "disk %u has invalid label %u (have %u)",
- i, group_id, nr_groups);
- return -BCH_ERR_invalid_sb_disk_groups;
- }
-
- if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
- prt_printf(err, "disk %u has deleted label %u", i, group_id);
- return -BCH_ERR_invalid_sb_disk_groups;
- }
- }
-
- if (!nr_groups)
- return 0;
-
- for (i = 0; i < nr_groups; i++) {
- g = groups->entries + i;
-
- if (BCH_GROUP_DELETED(g))
- continue;
-
- len = strnlen(g->label, sizeof(g->label));
- if (!len) {
- prt_printf(err, "label %u empty", i);
- return -BCH_ERR_invalid_sb_disk_groups;
- }
- }
-
- sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
- if (!sorted)
- return -BCH_ERR_ENOMEM_disk_groups_validate;
-
- memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
- sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
-
- for (g = sorted; g + 1 < sorted + nr_groups; g++)
- if (!BCH_GROUP_DELETED(g) &&
- !group_cmp(&g[0], &g[1])) {
- prt_printf(err, "duplicate label %llu.%.*s",
- BCH_GROUP_PARENT(g),
- (int) sizeof(g->label), g->label);
- ret = -BCH_ERR_invalid_sb_disk_groups;
- goto err;
- }
-err:
- kfree(sorted);
- return ret;
-}
-
-static void bch2_sb_disk_groups_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_disk_groups *groups =
- field_to_type(f, disk_groups);
- struct bch_disk_group *g;
- unsigned nr_groups = disk_groups_nr(groups);
-
- for (g = groups->entries;
- g < groups->entries + nr_groups;
- g++) {
- if (g != groups->entries)
- prt_printf(out, " ");
-
- if (BCH_GROUP_DELETED(g))
- prt_printf(out, "[deleted]");
- else
- prt_printf(out, "[parent %llu name %s]",
- BCH_GROUP_PARENT(g), g->label);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
- .validate = bch2_sb_disk_groups_validate,
- .to_text = bch2_sb_disk_groups_to_text
-};
-
-int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_disk_groups *groups;
- struct bch_disk_groups_cpu *cpu_g, *old_g;
- unsigned i, g, nr_groups;
-
- lockdep_assert_held(&c->sb_lock);
-
- groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups);
- nr_groups = disk_groups_nr(groups);
-
- if (!groups)
- return 0;
-
- cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
- if (!cpu_g)
- return bch_err_throw(c, ENOMEM_disk_groups_to_cpu);
-
- cpu_g->nr = nr_groups;
-
- for (i = 0; i < nr_groups; i++) {
- struct bch_disk_group *src = &groups->entries[i];
- struct bch_disk_group_cpu *dst = &cpu_g->entries[i];
-
- dst->deleted = BCH_GROUP_DELETED(src);
- dst->parent = BCH_GROUP_PARENT(src);
- memcpy(dst->label, src->label, sizeof(dst->label));
- }
-
- for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
- struct bch_disk_group_cpu *dst;
-
- if (!bch2_member_alive(&m))
- continue;
-
- g = BCH_MEMBER_GROUP(&m);
- while (g) {
- dst = &cpu_g->entries[g - 1];
- __set_bit(i, dst->devs.d);
- g = dst->parent;
- }
- }
-
- old_g = rcu_dereference_protected(c->disk_groups,
- lockdep_is_held(&c->sb_lock));
- rcu_assign_pointer(c->disk_groups, cpu_g);
- if (old_g)
- kfree_rcu(old_g, rcu);
-
- return 0;
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
-{
- struct target t = target_decode(target);
-
- guard(rcu)();
-
- switch (t.type) {
- case TARGET_NULL:
- return NULL;
- case TARGET_DEV: {
- struct bch_dev *ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
- return ca ? &ca->self : NULL;
- }
- case TARGET_GROUP: {
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-
- return g && t.group < g->nr && !g->entries[t.group].deleted
- ? &g->entries[t.group].devs
- : NULL;
- }
- default:
- BUG();
- }
-}
-
-bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
-{
- struct target t = target_decode(target);
-
- switch (t.type) {
- case TARGET_NULL:
- return false;
- case TARGET_DEV:
- return dev == t.dev;
- case TARGET_GROUP: {
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- const struct bch_devs_mask *m =
- g && t.group < g->nr && !g->entries[t.group].deleted
- ? &g->entries[t.group].devs
- : NULL;
-
- return m ? test_bit(dev, m->d) : false;
- }
- default:
- BUG();
- }
-}
-
-static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
- unsigned parent,
- const char *name, unsigned namelen)
-{
- unsigned i, nr_groups = disk_groups_nr(groups);
-
- if (!namelen || namelen > BCH_SB_LABEL_SIZE)
- return -EINVAL;
-
- for (i = 0; i < nr_groups; i++) {
- struct bch_disk_group *g = groups->entries + i;
-
- if (BCH_GROUP_DELETED(g))
- continue;
-
- if (!BCH_GROUP_DELETED(g) &&
- BCH_GROUP_PARENT(g) == parent &&
- strnlen(g->label, sizeof(g->label)) == namelen &&
- !memcmp(name, g->label, namelen))
- return i;
- }
-
- return -1;
-}
-
-static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
- const char *name, unsigned namelen)
-{
- struct bch_sb_field_disk_groups *groups =
- bch2_sb_field_get(sb->sb, disk_groups);
- unsigned i, nr_groups = disk_groups_nr(groups);
- struct bch_disk_group *g;
-
- if (!namelen || namelen > BCH_SB_LABEL_SIZE)
- return -EINVAL;
-
- for (i = 0;
- i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
- i++)
- ;
-
- if (i == nr_groups) {
- unsigned u64s =
- (sizeof(struct bch_sb_field_disk_groups) +
- sizeof(struct bch_disk_group) * (nr_groups + 1)) /
- sizeof(u64);
-
- groups = bch2_sb_field_resize(sb, disk_groups, u64s);
- if (!groups)
- return -BCH_ERR_ENOSPC_disk_label_add;
-
- nr_groups = disk_groups_nr(groups);
- }
-
- BUG_ON(i >= nr_groups);
-
- g = &groups->entries[i];
-
- memcpy(g->label, name, namelen);
- if (namelen < sizeof(g->label))
- g->label[namelen] = '\0';
- SET_BCH_GROUP_DELETED(g, 0);
- SET_BCH_GROUP_PARENT(g, parent);
- SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
-
- return i;
-}
-
-int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
-{
- struct bch_sb_field_disk_groups *groups =
- bch2_sb_field_get(sb->sb, disk_groups);
- int v = -1;
-
- do {
- const char *next = strchrnul(name, '.');
- unsigned len = next - name;
-
- if (*next == '.')
- next++;
-
- v = __bch2_disk_group_find(groups, v + 1, name, len);
- name = next;
- } while (*name && v >= 0);
-
- return v;
-}
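A usage sketch (hedged: the label "ssd.fast" and the wrapper are hypothetical): nested labels are resolved one dot-separated component at a time, and the index of the innermost group is returned, or a negative value if any component is missing:

static int find_nested_label_example(struct bch_sb_handle *sb)
{
	/* index of the "fast" group nested under "ssd", or negative if not found */
	return bch2_disk_path_find(sb, "ssd.fast");
}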
-
-int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
-{
- struct bch_sb_field_disk_groups *groups;
- unsigned parent = 0;
- int v = -1;
-
- do {
- const char *next = strchrnul(name, '.');
- unsigned len = next - name;
-
- if (*next == '.')
- next++;
-
- groups = bch2_sb_field_get(sb->sb, disk_groups);
-
- v = __bch2_disk_group_find(groups, parent, name, len);
- if (v < 0)
- v = __bch2_disk_group_add(sb, parent, name, len);
- if (v < 0)
- return v;
-
- parent = v + 1;
- name = next;
- } while (*name && v >= 0);
-
- return v;
-}
-
-static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g,
- unsigned v)
-{
- u16 path[32];
- unsigned nr = 0;
-
- while (1) {
- if (nr == ARRAY_SIZE(path))
- goto invalid;
-
- if (v >= (g ? g->nr : 0))
- goto invalid;
-
- struct bch_disk_group_cpu *e = g->entries + v;
-
- if (e->deleted)
- goto invalid;
-
- path[nr++] = v;
-
- if (!e->parent)
- break;
-
- v = e->parent - 1;
- }
-
- while (nr) {
- struct bch_disk_group_cpu *e = g->entries + path[--nr];
-
- prt_printf(out, "%.*s", (int) sizeof(e->label), e->label);
- if (nr)
- prt_printf(out, ".");
- }
- return;
-invalid:
- prt_printf(out, "invalid label %u", v);
-}
-
-void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
-{
- bch2_printbuf_make_room(out, 4096);
-
- out->atomic++;
- guard(rcu)();
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-
- for (unsigned i = 0; i < (g ? g->nr : 0); i++) {
- prt_printf(out, "%2u: ", i);
-
- if (g->entries[i].deleted) {
- prt_printf(out, "[deleted]");
- goto next;
- }
-
- __bch2_disk_path_to_text(out, g, i);
-
- prt_printf(out, " devs");
-
- for_each_member_device_rcu(c, ca, &g->entries[i].devs)
- prt_printf(out, " %s", ca->name);
-next:
- prt_newline(out);
- }
-
- out->atomic--;
-}
-
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
-{
- out->atomic++;
- guard(rcu)();
-	__bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v);
-	--out->atomic;
-}
-
-void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
-{
- struct bch_sb_field_disk_groups *groups =
- bch2_sb_field_get(sb, disk_groups);
- struct bch_disk_group *g;
- unsigned nr = 0;
- u16 path[32];
-
- while (1) {
- if (nr == ARRAY_SIZE(path))
- goto inval;
-
- if (v >= disk_groups_nr(groups))
- goto inval;
-
- g = groups->entries + v;
-
- if (BCH_GROUP_DELETED(g))
- goto inval;
-
- path[nr++] = v;
-
- if (!BCH_GROUP_PARENT(g))
- break;
-
- v = BCH_GROUP_PARENT(g) - 1;
- }
-
- while (nr) {
- v = path[--nr];
- g = groups->entries + v;
-
- prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
- if (nr)
- prt_printf(out, ".");
- }
- return;
-inval:
- prt_printf(out, "invalid label %u", v);
-}
-
-int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
-{
- lockdep_assert_held(&c->sb_lock);
-
- if (!strlen(name) || !strcmp(name, "none")) {
- struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_GROUP(mi, 0);
- } else {
- int v = bch2_disk_path_find_or_create(&c->disk_sb, name);
- if (v < 0)
- return v;
-
- struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_GROUP(mi, v + 1);
- }
-
- return bch2_sb_disk_groups_to_cpu(c);
-}
-
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
-{
- int ret;
-
- mutex_lock(&c->sb_lock);
- ret = __bch2_dev_group_set(c, ca, name) ?:
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
- struct printbuf *err)
-{
- struct bch_dev *ca;
- int g;
-
- if (!val)
- return -EINVAL;
-
- if (!c)
- return -BCH_ERR_option_needs_open_fs;
-
- if (!strlen(val) || !strcmp(val, "none")) {
- *res = 0;
- return 0;
- }
-
- /* Is it a device? */
- ca = bch2_dev_lookup(c, val);
- if (!IS_ERR(ca)) {
- *res = dev_to_target(ca->dev_idx);
- bch2_dev_put(ca);
- return 0;
- }
-
- mutex_lock(&c->sb_lock);
- g = bch2_disk_path_find(&c->disk_sb, val);
- mutex_unlock(&c->sb_lock);
-
- if (g >= 0) {
- *res = group_to_target(g);
- return 0;
- }
-
- return -EINVAL;
-}
-
-void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
-{
- struct target t = target_decode(v);
-
- switch (t.type) {
- case TARGET_NULL:
- prt_printf(out, "none");
- return;
- case TARGET_DEV: {
- out->atomic++;
- guard(rcu)();
- struct bch_dev *ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
-
- if (ca && ca->disk_sb.bdev)
- prt_printf(out, "/dev/%s", ca->name);
- else if (ca)
- prt_printf(out, "offline device %u", t.dev);
- else
- prt_printf(out, "invalid device %u", t.dev);
-
- out->atomic--;
- return;
- }
- case TARGET_GROUP:
- bch2_disk_path_to_text(out, c, t.group);
- return;
- default:
- BUG();
- }
-}
-
-static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
-{
- struct target t = target_decode(v);
-
- switch (t.type) {
- case TARGET_NULL:
- prt_printf(out, "none");
- break;
- case TARGET_DEV: {
- struct bch_member m = bch2_sb_member_get(sb, t.dev);
-
- if (bch2_member_exists(sb, t.dev)) {
- prt_printf(out, "Device ");
- pr_uuid(out, m.uuid.b);
- prt_printf(out, " (%u)", t.dev);
- } else {
- prt_printf(out, "Bad device %u", t.dev);
- }
- break;
- }
- case TARGET_GROUP:
- bch2_disk_path_to_text_sb(out, sb, t.group);
- break;
- default:
- BUG();
- }
-}
-
-void bch2_opt_target_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
-{
- if (c)
- bch2_target_to_text(out, c, v);
- else
- bch2_target_to_text_sb(out, sb, v);
-}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
deleted file mode 100644
index 441826fff224..000000000000
--- a/fs/bcachefs/disk_groups.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_H
-#define _BCACHEFS_DISK_GROUPS_H
-
-#include "disk_groups_types.h"
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
-
-static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
-{
- return groups
- ? (vstruct_end(&groups->field) -
- (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
- : 0;
-}
-
-struct target {
- enum {
- TARGET_NULL,
- TARGET_DEV,
- TARGET_GROUP,
- } type;
- union {
- unsigned dev;
- unsigned group;
- };
-};
-
-#define TARGET_DEV_START 1
-#define TARGET_GROUP_START (256 + TARGET_DEV_START)
-
-static inline u16 dev_to_target(unsigned dev)
-{
- return TARGET_DEV_START + dev;
-}
-
-static inline u16 group_to_target(unsigned group)
-{
- return TARGET_GROUP_START + group;
-}
-
-static inline struct target target_decode(unsigned target)
-{
- if (target >= TARGET_GROUP_START)
- return (struct target) {
- .type = TARGET_GROUP,
- .group = target - TARGET_GROUP_START
- };
-
- if (target >= TARGET_DEV_START)
- return (struct target) {
- .type = TARGET_DEV,
- .group = target - TARGET_DEV_START
- };
-
- return (struct target) { .type = TARGET_NULL };
-}
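A small worked example of the encoding above (the function itself is illustrative only): device and group indices are offset into disjoint ranges of a single 16-bit target value, and target_decode() maps them back:

static bool target_encode_decode_example(void)
{
	/* dev 3 encodes to TARGET_DEV_START + 3 == 4 */
	struct target dev = target_decode(dev_to_target(3));
	/* group 3 encodes to TARGET_GROUP_START + 3 == 260 */
	struct target grp = target_decode(group_to_target(3));

	return dev.type == TARGET_DEV && dev.dev == 3 &&
	       grp.type == TARGET_GROUP && grp.group == 3;
}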
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
-
-static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
- enum bch_data_type data_type,
- u16 target)
-{
- struct bch_devs_mask devs = c->rw_devs[data_type];
- const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
-
- if (t)
- bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
- return devs;
-}
-
-static inline bool bch2_target_accepts_data(struct bch_fs *c,
- enum bch_data_type data_type,
- u16 target)
-{
- struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target);
- return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX);
-}
-
-bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
-
-int bch2_disk_path_find(struct bch_sb_handle *, const char *);
-
-/* Exported for userspace bcachefs-tools: */
-int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-
-void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
-void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
-
-void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
-
-int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
-void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
-
-#define bch2_opt_target (struct bch_opt_fn) { \
- .parse = bch2_opt_target_parse, \
- .to_text = bch2_opt_target_to_text, \
-}
-
-int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
-
-int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
-int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
-
-const char *bch2_sb_validate_disk_groups(struct bch_sb *,
- struct bch_sb_field *);
-
-void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *);
-
-#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h
deleted file mode 100644
index 698990bbf1d2..000000000000
--- a/fs/bcachefs/disk_groups_format.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H
-#define _BCACHEFS_DISK_GROUPS_FORMAT_H
-
-#define BCH_SB_LABEL_SIZE 32
-
-struct bch_disk_group {
- __u8 label[BCH_SB_LABEL_SIZE];
- __le64 flags[2];
-} __packed __aligned(8);
-
-LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
-LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
-
-struct bch_sb_field_disk_groups {
- struct bch_sb_field field;
- struct bch_disk_group entries[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */
diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h
deleted file mode 100644
index a54ef085b13d..000000000000
--- a/fs/bcachefs/disk_groups_types.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
-#define _BCACHEFS_DISK_GROUPS_TYPES_H
-
-struct bch_disk_group_cpu {
- bool deleted;
- u16 parent;
- u8 label[BCH_SB_LABEL_SIZE];
- struct bch_devs_mask devs;
-};
-
-struct bch_disk_groups_cpu {
- struct rcu_head rcu;
- unsigned nr;
- struct bch_disk_group_cpu entries[] __counted_by(nr);
-};
-
-#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
deleted file mode 100644
index 543dbba9b14f..000000000000
--- a/fs/bcachefs/ec.c
+++ /dev/null
@@ -1,2405 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/* erasure coding */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "keylist.h"
-#include "lru.h"
-#include "recovery.h"
-#include "replicas.h"
-#include "super-io.h"
-#include "util.h"
-
-#include <linux/sort.h>
-#include <linux/string_choices.h>
-
-#ifdef __KERNEL__
-
-#include <linux/raid/pq.h>
-#include <linux/raid/xor.h>
-
-static void raid5_recov(unsigned disks, unsigned failed_idx,
- size_t size, void **data)
-{
- unsigned i = 2, nr;
-
- BUG_ON(failed_idx >= disks);
-
- swap(data[0], data[failed_idx]);
- memcpy(data[0], data[1], size);
-
- while (i < disks) {
- nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
- xor_blocks(nr, size, data[0], data + i);
- i += nr;
- }
-
- swap(data[0], data[failed_idx]);
-}
-
-static void raid_gen(int nd, int np, size_t size, void **v)
-{
- if (np >= 1)
- raid5_recov(nd + np, nd, size, v);
- if (np >= 2)
- raid6_call.gen_syndrome(nd + np, size, v);
- BUG_ON(np > 2);
-}
-
-static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
-{
- switch (nr) {
- case 0:
- break;
- case 1:
- if (ir[0] < nd + 1)
- raid5_recov(nd + 1, ir[0], size, v);
- else
- raid6_call.gen_syndrome(nd + np, size, v);
- break;
- case 2:
- if (ir[1] < nd) {
- /* data+data failure. */
- raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
- } else if (ir[0] < nd) {
- /* data + p/q failure */
-
- if (ir[1] == nd) /* data + p failure */
- raid6_datap_recov(nd + np, size, ir[0], v);
- else { /* data + q failure */
- raid5_recov(nd + 1, ir[0], size, v);
- raid6_call.gen_syndrome(nd + np, size, v);
- }
- } else {
- raid_gen(nd, np, size, v);
- }
- break;
- default:
- BUG();
- }
-}
-
-#else
-
-#include <raid/raid.h>
-
-#endif
-
-struct ec_bio {
- struct bch_dev *ca;
- struct ec_stripe_buf *buf;
- size_t idx;
- int rw;
- u64 submit_time;
- struct bio bio;
-};
-
-/* Stripes btree keys: */
-
-int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- int ret = 0;
-
- bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
- bpos_gt(k.k->p, POS(0, U32_MAX)),
- c, stripe_pos_bad,
- "stripe at bad pos");
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
- c, stripe_val_size_bad,
- "incorrect value size (%zu < %u)",
- bkey_val_u64s(k.k), stripe_val_u64s(s));
-
- bkey_fsck_err_on(s->csum_granularity_bits >= 64,
- c, stripe_csum_granularity_bad,
- "invalid csum granularity (%u >= 64)",
- s->csum_granularity_bits);
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
- struct bch_stripe s = {};
-
- memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
-
- unsigned nr_data = s.nr_blocks - s.nr_redundant;
-
- prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
- s.algorithm,
- le16_to_cpu(s.sectors),
- nr_data,
- s.nr_redundant);
- bch2_prt_csum_type(out, s.csum_type);
- prt_str(out, " gran ");
- if (s.csum_granularity_bits < 64)
- prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
- else
- prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
-
- if (s.disk_label) {
- prt_str(out, " label");
- bch2_disk_path_to_text(out, c, s.disk_label - 1);
- }
-
- for (unsigned i = 0; i < s.nr_blocks; i++) {
- const struct bch_extent_ptr *ptr = sp->ptrs + i;
-
- if ((void *) ptr >= bkey_val_end(k))
- break;
-
- prt_char(out, ' ');
- bch2_extent_ptr_to_text(out, c, ptr);
-
- if (s.csum_type < BCH_CSUM_NR &&
- i < nr_data &&
- stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
- prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
- }
-}
-
-/* Triggers: */
-
-static int __mark_stripe_bucket(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c_stripe s,
- unsigned ptr_idx, bool deleting,
- struct bpos bucket,
- struct bch_alloc_v4 *a,
- enum btree_iter_update_trigger_flags flags)
-{
- const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
- unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
- bool parity = ptr_idx >= nr_data;
- enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
- s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_fs *c = trans->c;
- if (deleting)
- sectors = -sectors;
-
- if (!deleting) {
- if (bch2_trans_inconsistent_on(a->stripe ||
- a->stripe_redundancy, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
- bucket.inode, bucket.offset, a->gen,
- bch2_data_type_str(a->data_type),
- a->dirty_sectors,
- a->stripe, s.k->p.offset,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
- bucket.inode, bucket.offset, a->gen,
- bch2_data_type_str(a->data_type),
- a->dirty_sectors,
- a->cached_sectors,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
- } else {
- if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
- a->stripe_redundancy != s.v->nr_redundant, trans,
- "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
- bucket.inode, bucket.offset, a->gen,
- a->stripe,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
- "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
- bucket.inode, bucket.offset, a->gen,
- bch2_data_type_str(a->data_type),
- bch2_data_type_str(data_type),
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(parity &&
- (a->dirty_sectors != -sectors ||
- a->cached_sectors), trans,
- "bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
- bucket.inode, bucket.offset, a->gen,
- a->dirty_sectors,
- a->cached_sectors,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
- }
-
- if (sectors) {
- ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
- a->gen, a->data_type, &a->dirty_sectors);
- if (ret)
- goto err;
- }
-
- if (!deleting) {
- a->stripe = s.k->p.offset;
- a->stripe_redundancy = s.v->nr_redundant;
- alloc_data_type_set(a, data_type);
- } else {
- a->stripe = 0;
- a->stripe_redundancy = 0;
- alloc_data_type_set(a, BCH_DATA_user);
- }
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned ptr_idx, bool deleting,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
- if (unlikely(!ca)) {
- if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
-
- struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
-
- if (flags & BTREE_TRIGGER_transactional) {
- struct extent_ptr_decoded p = {
- .ptr = *ptr,
- .crc = bch2_extent_crc_unpack(s.k, NULL),
- };
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
- (const union bch_extent_entry *) ptr, &bp);
-
- struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update(trans, bucket, 0);
- ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
- bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
- !(flags & BTREE_TRIGGER_overwrite));
- if (ret)
- goto err;
- }
-
- if (flags & BTREE_TRIGGER_gc) {
- struct bucket *g = gc_bucket(ca, bucket.offset);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s",
- ptr->dev,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = bch_err_throw(c, mark_stripe);
- goto err;
- }
-
- bucket_lock(g);
- struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
- ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
- alloc_to_bucket(g, new);
- bucket_unlock(g);
-
- if (!ret)
- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
- }
-err:
- bch2_dev_put(ca);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int mark_stripe_buckets(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- enum btree_iter_update_trigger_flags flags)
-{
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
-
- BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
-
- unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
-
- for (unsigned i = 0; i < nr_blocks; i++) {
- if (new_s && old_s &&
- !memcmp(&new_s->ptrs[i],
- &old_s->ptrs[i],
- sizeof(new_s->ptrs[i])))
- continue;
-
- if (new_s) {
- int ret = mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(new), i, false, flags);
- if (ret)
- return ret;
- }
-
- if (old_s) {
- int ret = mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(old), i, true, flags);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_trigger_stripe(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s _new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_s_c new = _new.s_c;
- struct bch_fs *c = trans->c;
- u64 idx = new.k->p.offset;
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
-
- if (unlikely(flags & BTREE_TRIGGER_check_repair))
- return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
-
- BUG_ON(new_s && old_s &&
- (new_s->nr_blocks != old_s->nr_blocks ||
- new_s->nr_redundant != old_s->nr_redundant));
-
- if (flags & BTREE_TRIGGER_transactional) {
- int ret = bch2_lru_change(trans,
- BCH_LRU_STRIPE_FRAGMENTATION,
- idx,
- stripe_lru_pos(old_s),
- stripe_lru_pos(new_s));
- if (ret)
- return ret;
- }
-
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- /*
- * If the pointers aren't changing, we don't need to do anything:
- */
- if (new_s && old_s &&
- new_s->nr_blocks == old_s->nr_blocks &&
- new_s->nr_redundant == old_s->nr_redundant &&
- !memcmp(old_s->ptrs, new_s->ptrs,
- new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
- return 0;
-
- struct gc_stripe *gc = NULL;
- if (flags & BTREE_TRIGGER_gc) {
- gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
- if (!gc) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
- return bch_err_throw(c, ENOMEM_mark_stripe);
- }
-
- /*
- * This will be wrong when we bring back runtime gc: we should
- * be unmarking the old key and then marking the new key
- *
- * Also: when we bring back runtime gc, this will need locking
- */
- gc->alive = true;
- gc->sectors = le16_to_cpu(new_s->sectors);
- gc->nr_blocks = new_s->nr_blocks;
- gc->nr_redundant = new_s->nr_redundant;
-
- for (unsigned i = 0; i < new_s->nr_blocks; i++)
- gc->ptrs[i] = new_s->ptrs[i];
-
- /*
- * gc recalculates this field from stripe ptr
- * references:
- */
- memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
- }
-
- if (new_s) {
- s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
-
- struct disk_accounting_pos acc;
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, new);
- int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
- if (ret)
- return ret;
-
- if (gc)
- unsafe_memcpy(&gc->r.e, &acc.replicas,
- replicas_entry_bytes(&acc.replicas), "VLA");
- }
-
- if (old_s) {
- s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
-
- struct disk_accounting_pos acc;
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, old);
- int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
- if (ret)
- return ret;
- }
-
- int ret = mark_stripe_buckets(trans, old, new, flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* returns matching pointer, sets *block to the blocknr in the stripe that matched: */
-static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
- struct bkey_s_c k, unsigned *block)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
-
- bkey_for_each_ptr(ptrs, ptr)
- for (i = 0; i < nr_data; i++)
- if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
- le16_to_cpu(s->sectors))) {
- *block = i;
- return ptr;
- }
-
- return NULL;
-}
-
-static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
-
- bkey_extent_entry_for_each(ptrs, entry)
- if (extent_entry_type(entry) ==
- BCH_EXTENT_ENTRY_stripe_ptr &&
- entry->stripe_ptr.idx == idx)
- return true;
-
- return false;
-}
-
-/* Stripe bufs: */
-
-static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
-{
- if (buf->key.k.type == KEY_TYPE_stripe) {
- struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
- unsigned i;
-
- for (i = 0; i < s->v.nr_blocks; i++) {
- kvfree(buf->data[i]);
- buf->data[i] = NULL;
- }
- }
-}
-
-/* XXX: this is a non-mempoolified memory allocation: */
-static int ec_stripe_buf_init(struct bch_fs *c,
- struct ec_stripe_buf *buf,
- unsigned offset, unsigned size)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned csum_granularity = 1U << v->csum_granularity_bits;
- unsigned end = offset + size;
- unsigned i;
-
- BUG_ON(end > le16_to_cpu(v->sectors));
-
- offset = round_down(offset, csum_granularity);
- end = min_t(unsigned, le16_to_cpu(v->sectors),
- round_up(end, csum_granularity));
-
- buf->offset = offset;
- buf->size = end - offset;
-
- memset(buf->valid, 0xFF, sizeof(buf->valid));
-
- for (i = 0; i < v->nr_blocks; i++) {
- buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
- if (!buf->data[i])
- goto err;
- }
-
- return 0;
-err:
- ec_stripe_buf_exit(buf);
- return bch_err_throw(c, ENOMEM_stripe_buf);
-}
-
-/* Checksumming: */
-
-static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
- unsigned block, unsigned offset)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned csum_granularity = 1 << v->csum_granularity_bits;
- unsigned end = buf->offset + buf->size;
- unsigned len = min(csum_granularity, end - offset);
-
- BUG_ON(offset >= end);
- BUG_ON(offset < buf->offset);
- BUG_ON(offset & (csum_granularity - 1));
- BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
- (len & (csum_granularity - 1)));
-
- return bch2_checksum(NULL, v->csum_type,
- null_nonce(),
- buf->data[block] + ((offset - buf->offset) << 9),
- len << 9);
-}
-
-static void ec_generate_checksums(struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned i, j, csums_per_device = stripe_csums_per_device(v);
-
- if (!v->csum_type)
- return;
-
- BUG_ON(buf->offset);
- BUG_ON(buf->size != le16_to_cpu(v->sectors));
-
- for (i = 0; i < v->nr_blocks; i++)
- for (j = 0; j < csums_per_device; j++)
- stripe_csum_set(v, i, j,
- ec_block_checksum(buf, i, j << v->csum_granularity_bits));
-}
-
-static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned csum_granularity = 1 << v->csum_granularity_bits;
- unsigned i;
-
- if (!v->csum_type)
- return;
-
- for (i = 0; i < v->nr_blocks; i++) {
- unsigned offset = buf->offset;
- unsigned end = buf->offset + buf->size;
-
- if (!test_bit(i, buf->valid))
- continue;
-
- while (offset < end) {
- unsigned j = offset >> v->csum_granularity_bits;
- unsigned len = min(csum_granularity, end - offset);
- struct bch_csum want = stripe_csum_get(v, i, j);
- struct bch_csum got = ec_block_checksum(buf, i, offset);
-
- if (bch2_crc_cmp(want, got)) {
- struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
- if (ca) {
- struct printbuf err = PRINTBUF;
-
- prt_str(&err, "stripe ");
- bch2_csum_err_msg(&err, v->csum_type, want, got);
- prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
- bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
- bch_err_ratelimited(ca, "%s", err.buf);
- printbuf_exit(&err);
-
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
- }
-
- clear_bit(i, buf->valid);
- break;
- }
-
- offset += len;
- }
- }
-}
-
-/* Erasure coding: */
-
-static void ec_generate_ec(struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
- unsigned bytes = le16_to_cpu(v->sectors) << 9;
-
- raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
-}
-
-static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
-
- return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
-}
-
-static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
- unsigned bytes = buf->size << 9;
-
- if (ec_nr_failed(buf) > v->nr_redundant) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: unable to read enough blocks");
- return -1;
- }
-
- for (i = 0; i < nr_data; i++)
- if (!test_bit(i, buf->valid))
- failed[nr_failed++] = i;
-
- raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
- return 0;
-}
-
-/* IO: */
-
-static void ec_block_endio(struct bio *bio)
-{
- struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
- struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
- struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
- struct bch_dev *ca = ec_bio->ca;
- struct closure *cl = bio->bi_private;
- int rw = ec_bio->rw;
- unsigned ref = rw == READ
- ? BCH_DEV_READ_REF_ec_block
- : BCH_DEV_WRITE_REF_ec_block;
-
- bch2_account_io_completion(ca, bio_data_dir(bio),
- ec_bio->submit_time, !bio->bi_status);
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
- str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status));
- clear_bit(ec_bio->idx, ec_bio->buf->valid);
- }
-
- int stale = dev_ptr_stale(ca, ptr);
- if (stale) {
- bch_err_ratelimited(ca->fs,
- "error %s stripe: stale/invalid pointer (%i) after io",
- bio_data_dir(bio) == READ ? "reading from" : "writing to",
- stale);
- clear_bit(ec_bio->idx, ec_bio->buf->valid);
- }
-
- bio_put(&ec_bio->bio);
- enumerated_ref_put(&ca->io_ref[rw], ref);
- closure_put(cl);
-}
-
-static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
- blk_opf_t opf, unsigned idx, struct closure *cl)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
- unsigned offset = 0, bytes = buf->size << 9;
- struct bch_extent_ptr *ptr = &v->ptrs[idx];
- enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
- ? BCH_DATA_user
- : BCH_DATA_parity;
- int rw = op_is_write(opf);
- unsigned ref = rw == READ
- ? BCH_DEV_READ_REF_ec_block
- : BCH_DEV_WRITE_REF_ec_block;
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref);
- if (!ca) {
- clear_bit(idx, buf->valid);
- return;
- }
-
- int stale = dev_ptr_stale(ca, ptr);
- if (stale) {
- bch_err_ratelimited(c,
- "error %s stripe: stale pointer (%i)",
- rw == READ ? "reading from" : "writing to",
- stale);
- clear_bit(idx, buf->valid);
- return;
- }
-
- this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
-
- while (offset < bytes) {
- unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
- DIV_ROUND_UP(bytes, PAGE_SIZE));
- unsigned b = min_t(size_t, bytes - offset,
- nr_iovecs << PAGE_SHIFT);
- struct ec_bio *ec_bio;
-
- ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
- nr_iovecs,
- opf,
- GFP_KERNEL,
- &c->ec_bioset),
- struct ec_bio, bio);
-
- ec_bio->ca = ca;
- ec_bio->buf = buf;
- ec_bio->idx = idx;
- ec_bio->rw = rw;
- ec_bio->submit_time = local_clock();
-
- ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
- ec_bio->bio.bi_end_io = ec_block_endio;
- ec_bio->bio.bi_private = cl;
-
- bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
-
- closure_get(cl);
- enumerated_ref_get(&ca->io_ref[rw], ref);
-
- submit_bio(&ec_bio->bio);
-
- offset += b;
- }
-
- enumerated_ref_put(&ca->io_ref[rw], ref);
-}
-
-static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
- struct ec_stripe_buf *stripe)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- POS(0, idx), BTREE_ITER_slots);
- ret = bkey_err(k);
- if (ret)
- goto err;
- if (k.k->type != KEY_TYPE_stripe) {
- ret = -ENOENT;
- goto err;
- }
- bkey_reassemble(&stripe->key, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* recovery read path: */
-int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
- struct bkey_s_c orig_k)
-{
- struct bch_fs *c = trans->c;
- struct ec_stripe_buf *buf = NULL;
- struct closure cl;
- struct bch_stripe *v;
- unsigned i, offset;
- const char *msg = NULL;
- struct printbuf msgbuf = PRINTBUF;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- BUG_ON(!rbio->pick.has_ec);
-
- buf = kzalloc(sizeof(*buf), GFP_NOFS);
- if (!buf)
- return bch_err_throw(c, ENOMEM_ec_read_extent);
-
- ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
- if (ret) {
- msg = "stripe not found";
- goto err;
- }
-
- v = &bkey_i_to_stripe(&buf->key)->v;
-
- if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
- msg = "pointer doesn't match stripe";
- goto err;
- }
-
- offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
- if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
- msg = "read is bigger than stripe";
- goto err;
- }
-
- ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio));
- if (ret) {
- msg = "-ENOMEM";
- goto err;
- }
-
- for (i = 0; i < v->nr_blocks; i++)
- ec_block_io(c, buf, REQ_OP_READ, i, &cl);
-
- closure_sync(&cl);
-
- if (ec_nr_failed(buf) > v->nr_redundant) {
- msg = "unable to read enough blocks";
- goto err;
- }
-
- ec_validate_checksums(c, buf);
-
- ret = ec_do_recov(c, buf);
- if (ret)
- goto err;
-
- memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
- buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
-out:
- ec_stripe_buf_exit(buf);
- kfree(buf);
- return ret;
-err:
- bch2_bkey_val_to_text(&msgbuf, c, orig_k);
- bch_err_ratelimited(c,
- "error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
- printbuf_exit(&msgbuf);
- ret = bch_err_throw(c, stripe_reconstruct);
- goto out;
-}
-
-/* stripe bucket accounting: */
-
-static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
-{
- if (c->gc_pos.phase != GC_PHASE_not_running &&
- !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
- return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc);
-
- return 0;
-}
-
-static int ec_stripe_mem_alloc(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- return allocate_dropping_locks_errcode(trans,
- __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
-}
-
-/*
- * Hash table of open stripes:
- * Stripes that are being created or modified are kept in a hash table, so that
- * stripe deletion can skip them.
- */
-
-static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
- unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
- struct ec_stripe_new *s;
-
- hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
- if (s->idx == idx)
- return true;
- return false;
-}
-
-static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
-{
- bool ret = false;
-
- spin_lock(&c->ec_stripes_new_lock);
- ret = __bch2_stripe_is_open(c, idx);
- spin_unlock(&c->ec_stripes_new_lock);
-
- return ret;
-}
-
-static bool bch2_try_open_stripe(struct bch_fs *c,
- struct ec_stripe_new *s,
- u64 idx)
-{
- bool ret;
-
- spin_lock(&c->ec_stripes_new_lock);
- ret = !__bch2_stripe_is_open(c, idx);
- if (ret) {
- unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
-
- s->idx = idx;
- hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
- }
- spin_unlock(&c->ec_stripes_new_lock);
-
- return ret;
-}
-
-static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
-{
- BUG_ON(!s->idx);
-
- spin_lock(&c->ec_stripes_new_lock);
- hlist_del_init(&s->hash);
- spin_unlock(&c->ec_stripes_new_lock);
-
- s->idx = 0;
-}
-
-/* stripe deletion */
-
-static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- /*
- * We expect write buffer races here
- * Important: check stripe_is_open with stripe key locked:
- */
- if (k.k->type == KEY_TYPE_stripe &&
- !bch2_stripe_is_open(trans->c, idx) &&
- stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * XXX
- * can we kill this and delete stripes from the trigger?
- */
-static void ec_stripe_delete_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, ec_stripe_delete_work);
-
- bch2_trans_run(c,
- bch2_btree_write_buffer_tryflush(trans) ?:
- for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
- 0, lru_k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
- ec_stripe_delete(trans, lru_k.k->p.offset);
- })));
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
-}
-
-void bch2_do_stripe_deletes(struct bch_fs *c)
-{
- if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) &&
- !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
-}
-
-/* stripe creation: */
-
-static int ec_stripe_key_update(struct btree_trans *trans,
- struct bkey_i_stripe *old,
- struct bkey_i_stripe *new)
-{
- struct bch_fs *c = trans->c;
- bool create = !old;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
- c, "error %s stripe: got existing key type %s",
- create ? "creating" : "updating",
- bch2_bkey_types[k.k->type])) {
- ret = -EINVAL;
- goto err;
- }
-
- if (k.k->type == KEY_TYPE_stripe) {
- const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
-
- BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
- BUG_ON(old->v.nr_blocks != v->nr_blocks);
-
- for (unsigned i = 0; i < new->v.nr_blocks; i++) {
- unsigned sectors = stripe_blockcount_get(v, i);
-
- if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "stripe changed nonempty block %u", i);
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -EINVAL;
- goto err;
- }
-
- /*
- * If the stripe ptr changed underneath us, it must have
-			 * been dev_remove_stripes() -> invalidate_stripe_to_dev()
- */
- if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
- BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
-
- if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
- new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
- }
-
- stripe_blockcount_set(&new->v, i, sectors);
- }
- }
-
- ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int ec_stripe_update_extent(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bpos bucket, u8 gen,
- struct ec_stripe_buf *s,
- struct bkey_s_c_backpointer bp,
- struct bkey_buf *last_flushed)
-{
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- const struct bch_extent_ptr *ptr_c;
- struct bch_extent_ptr *ec_ptr = NULL;
- struct bch_extent_stripe_ptr stripe_ptr;
- struct bkey_i *n;
- int ret, dev, block;
-
- if (bp.v->level) {
- struct printbuf buf = PRINTBUF;
- struct btree_iter node_iter;
- struct btree *b;
-
- b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed);
- bch2_trans_iter_exit(trans, &node_iter);
-
- if (!b)
- return 0;
-
- prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
- bch2_bkey_val_to_text(&buf, c, bp.s_c);
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return bch_err_throw(c, erasure_coding_found_btree_node);
- }
-
- k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
- ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k) {
- /*
- * extent no longer exists - we could flush the btree
- * write buffer and retry to verify, but no need:
- */
- return 0;
- }
-
- if (extent_has_stripe_ptr(k, s->key.k.p.offset))
- goto out;
-
- ptr_c = bkey_matches_stripe(v, k, &block);
- /*
- * It doesn't generally make sense to erasure code cached ptrs:
- * XXX: should we be incrementing a counter?
- */
- if (!ptr_c || ptr_c->cached)
- goto out;
-
- dev = v->ptrs[block].dev;
-
- n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto out;
-
- bkey_reassemble(n, k);
-
- bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
- ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
- BUG_ON(!ec_ptr);
-
- stripe_ptr = (struct bch_extent_stripe_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
- .block = block,
- .redundancy = v->nr_redundant,
- .idx = s->key.k.p.offset,
- };
-
- __extent_entry_insert(n,
- (union bch_extent_entry *) ec_ptr,
- (union bch_extent_entry *) &stripe_ptr);
-
- ret = bch2_trans_update(trans, &iter, n, 0);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
- unsigned block)
-{
- struct bch_fs *c = trans->c;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- struct bch_extent_ptr ptr = v->ptrs[block];
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
- if (!ca)
- return bch_err_throw(c, ENOENT_dev_not_found);
-
- struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, bucket_pos),
- bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc, ({
- if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
- break;
-
- if (bp_k.k->type != KEY_TYPE_backpointer)
- continue;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- if (bp.v->btree_id == BTREE_ID_stripes)
- continue;
-
- ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
- bp, &last_flushed);
- }));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch2_dev_put(ca);
- return ret;
-}
-
-static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- unsigned nr_data = v->nr_blocks - v->nr_redundant;
-
- int ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- for (unsigned i = 0; i < nr_data; i++) {
- ret = ec_stripe_update_bucket(trans, s, i);
- if (ret)
- break;
- }
-err:
- bch2_trans_put(trans);
- return ret;
-}
-
-static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
- struct ec_stripe_new *s,
- unsigned block,
- struct open_bucket *ob)
-{
- struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE,
- BCH_DEV_WRITE_REF_ec_bucket_zero);
- if (!ca) {
- s->err = bch_err_throw(c, erofs_no_writes);
- return;
- }
-
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
- memset(s->new_stripe.data[block] + (offset << 9),
- 0,
- ob->sectors_free << 9);
-
- int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
- ob->bucket * ca->mi.bucket_size + offset,
- ob->sectors_free,
- GFP_KERNEL, 0);
-
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero);
-
- if (ret)
- s->err = ret;
-}
-
-void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
-{
- if (s->idx)
- bch2_stripe_close(c, s);
- kfree(s);
-}
-
-/*
- * data buckets of new stripe all written: create the stripe
- */
-static void ec_stripe_create(struct ec_stripe_new *s)
-{
- struct bch_fs *c = s->c;
- struct open_bucket *ob;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
- int ret;
-
- BUG_ON(s->h->s == s);
-
- closure_sync(&s->iodone);
-
- if (!s->err) {
- for (i = 0; i < nr_data; i++)
- if (s->blocks[i]) {
- ob = c->open_buckets + s->blocks[i];
-
- if (ob->sectors_free)
- zero_out_rest_of_ec_bucket(c, s, i, ob);
- }
- }
-
- if (s->err) {
- if (!bch2_err_matches(s->err, EROFS))
- bch_err(c, "error creating stripe: error writing data buckets");
- ret = s->err;
- goto err;
- }
-
- if (s->have_existing_stripe) {
- ec_validate_checksums(c, &s->existing_stripe);
-
- if (ec_do_recov(c, &s->existing_stripe)) {
- bch_err(c, "error creating stripe: error reading existing stripe");
- ret = bch_err_throw(c, ec_block_read);
- goto err;
- }
-
- for (i = 0; i < nr_data; i++)
- if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
- swap(s->new_stripe.data[i],
- s->existing_stripe.data[i]);
-
- ec_stripe_buf_exit(&s->existing_stripe);
- }
-
- BUG_ON(!s->allocated);
- BUG_ON(!s->idx);
-
- ec_generate_ec(&s->new_stripe);
-
- ec_generate_checksums(&s->new_stripe);
-
- /* write p/q: */
- for (i = nr_data; i < v->nr_blocks; i++)
- ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
- closure_sync(&s->iodone);
-
- if (ec_nr_failed(&s->new_stripe)) {
- bch_err(c, "error creating stripe: error writing redundancy buckets");
- ret = bch_err_throw(c, ec_block_write);
- goto err;
- }
-
- ret = bch2_trans_commit_do(c, &s->res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_key_update(trans,
- s->have_existing_stripe
- ? bkey_i_to_stripe(&s->existing_stripe.key)
- : NULL,
- bkey_i_to_stripe(&s->new_stripe.key)));
- bch_err_msg(c, ret, "creating stripe key");
-	if (ret)
-		goto err;
-
- ret = ec_stripe_update_extents(c, &s->new_stripe);
-	bch_err_msg(c, ret, "updating extents");
- if (ret)
- goto err;
-err:
- trace_stripe_create(c, s->idx, ret);
-
- bch2_disk_reservation_put(c, &s->res);
-
- for (i = 0; i < v->nr_blocks; i++)
- if (s->blocks[i]) {
- ob = c->open_buckets + s->blocks[i];
-
- if (i < nr_data) {
- ob->ec = NULL;
- __bch2_open_bucket_put(c, ob);
- } else {
- bch2_open_bucket_put(c, ob);
- }
- }
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_del(&s->list);
- mutex_unlock(&c->ec_stripe_new_lock);
- wake_up(&c->ec_stripe_new_wait);
-
- ec_stripe_buf_exit(&s->existing_stripe);
- ec_stripe_buf_exit(&s->new_stripe);
- closure_debug_destroy(&s->iodone);
-
- ec_stripe_new_put(c, s, STRIPE_REF_stripe);
-}
-
-static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
-{
- struct ec_stripe_new *s;
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list)
- if (!atomic_read(&s->ref[STRIPE_REF_io]))
- goto out;
- s = NULL;
-out:
- mutex_unlock(&c->ec_stripe_new_lock);
-
- return s;
-}
-
-static void ec_stripe_create_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work,
- struct bch_fs, ec_stripe_create_work);
- struct ec_stripe_new *s;
-
- while ((s = get_pending_stripe(c)))
- ec_stripe_create(s);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
-}
-
-void bch2_ec_do_stripe_creates(struct bch_fs *c)
-{
- enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create);
-
- if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
-}
-
-static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct ec_stripe_new *s = h->s;
-
- lockdep_assert_held(&h->lock);
-
- BUG_ON(!s->allocated && !s->err);
-
- h->s = NULL;
- s->pending = true;
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_add(&s->list, &c->ec_stripe_new_list);
- mutex_unlock(&c->ec_stripe_new_lock);
-
- ec_stripe_new_put(c, s, STRIPE_REF_io);
-}
-
-static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
-{
- h->s->err = err;
- ec_stripe_new_set_pending(c, h);
-}
-
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
-{
- struct ec_stripe_new *s = ob->ec;
-
- s->err = err;
-}
-
-void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
-{
- struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
- if (!ob)
- return NULL;
-
- BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
-
- struct bch_dev *ca = ob_dev(c, ob);
- unsigned offset = ca->mi.bucket_size - ob->sectors_free;
-
- return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
-}
-
-static int unsigned_cmp(const void *_l, const void *_r)
-{
- unsigned l = *((const unsigned *) _l);
- unsigned r = *((const unsigned *) _r);
-
- return cmp_int(l, r);
-}
-
-/* pick most common bucket size: */
-static unsigned pick_blocksize(struct bch_fs *c,
- struct bch_devs_mask *devs)
-{
- unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
- struct {
- unsigned nr, size;
- } cur = { 0, 0 }, best = { 0, 0 };
-
- for_each_member_device_rcu(c, ca, devs)
- sizes[nr++] = ca->mi.bucket_size;
-
- sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
-
- for (unsigned i = 0; i < nr; i++) {
- if (sizes[i] != cur.size) {
- if (cur.nr > best.nr)
- best = cur;
-
- cur.nr = 0;
- cur.size = sizes[i];
- }
-
- cur.nr++;
- }
-
- if (cur.nr > best.nr)
- best = cur;
-
- return best.size;
-}
-
-static bool may_create_new_stripe(struct bch_fs *c)
-{
- return false;
-}
-
-static void ec_stripe_key_init(struct bch_fs *c,
- struct bkey_i *k,
- unsigned nr_data,
- unsigned nr_parity,
- unsigned stripe_size,
- unsigned disk_label)
-{
- struct bkey_i_stripe *s = bkey_stripe_init(k);
- unsigned u64s;
-
- s->v.sectors = cpu_to_le16(stripe_size);
- s->v.algorithm = 0;
- s->v.nr_blocks = nr_data + nr_parity;
- s->v.nr_redundant = nr_parity;
- s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
- s->v.csum_type = BCH_CSUM_crc32c;
- s->v.disk_label = disk_label;
-
- while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
- BUG_ON(1 << s->v.csum_granularity_bits >=
- le16_to_cpu(s->v.sectors) ||
- s->v.csum_granularity_bits == U8_MAX);
- s->v.csum_granularity_bits++;
- }
-
- set_bkey_val_u64s(&s->k, u64s);
-}
-
-static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct ec_stripe_new *s;
-
- lockdep_assert_held(&h->lock);
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return NULL;
-
- mutex_init(&s->lock);
- closure_init(&s->iodone, NULL);
- atomic_set(&s->ref[STRIPE_REF_stripe], 1);
- atomic_set(&s->ref[STRIPE_REF_io], 1);
- s->c = c;
- s->h = h;
- s->nr_data = min_t(unsigned, h->nr_active_devs,
- BCH_BKEY_PTRS_MAX) - h->redundancy;
- s->nr_parity = h->redundancy;
-
- ec_stripe_key_init(c, &s->new_stripe.key,
- s->nr_data, s->nr_parity,
- h->blocksize, h->disk_label);
- return s;
-}
-
-static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct bch_devs_mask devs = h->devs;
- unsigned nr_devs, nr_devs_with_durability;
-
- scoped_guard(rcu) {
- h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
- ? group_to_target(h->disk_label - 1)
- : 0);
- nr_devs = dev_mask_nr(&h->devs);
-
- for_each_member_device_rcu(c, ca, &h->devs)
- if (!ca->mi.durability)
- __clear_bit(ca->dev_idx, h->devs.d);
- nr_devs_with_durability = dev_mask_nr(&h->devs);
-
- h->blocksize = pick_blocksize(c, &h->devs);
-
- h->nr_active_devs = 0;
- for_each_member_device_rcu(c, ca, &h->devs)
- if (ca->mi.bucket_size == h->blocksize)
- h->nr_active_devs++;
- }
-
- /*
- * If we only have redundancy + 1 devices, we're better off with just
- * replication:
- */
- h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
-
- if (h->insufficient_devs) {
- const char *err;
-
- if (nr_devs < h->redundancy + 2)
- err = NULL;
- else if (nr_devs_with_durability < h->redundancy + 2)
- err = "cannot use durability=0 devices";
- else
- err = "mismatched bucket sizes";
-
- if (err)
- bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
- h->nr_active_devs, h->redundancy + 2, err);
- }
-
- struct bch_devs_mask devs_leaving;
- bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
-
- if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
- ec_stripe_new_cancel(c, h, -EINTR);
-
- h->rw_devs_change_count = c->rw_devs_change_count;
-}
-
-static struct ec_stripe_head *
-ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
- unsigned algo, unsigned redundancy,
- enum bch_watermark watermark)
-{
- struct ec_stripe_head *h;
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return NULL;
-
- mutex_init(&h->lock);
- BUG_ON(!mutex_trylock(&h->lock));
-
- h->disk_label = disk_label;
- h->algo = algo;
- h->redundancy = redundancy;
- h->watermark = watermark;
-
- list_add(&h->list, &c->ec_stripe_head_list);
- return h;
-}
-
-void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
-{
- if (h->s &&
- h->s->allocated &&
- bitmap_weight(h->s->blocks_allocated,
- h->s->nr_data) == h->s->nr_data)
- ec_stripe_new_set_pending(c, h);
-
- mutex_unlock(&h->lock);
-}
-
-static struct ec_stripe_head *
-__bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned disk_label,
- unsigned algo,
- unsigned redundancy,
- enum bch_watermark watermark)
-{
- struct bch_fs *c = trans->c;
- struct ec_stripe_head *h;
- int ret;
-
- if (!redundancy)
- return NULL;
-
- ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
- if (ret)
- return ERR_PTR(ret);
-
- if (test_bit(BCH_FS_going_ro, &c->flags)) {
- h = ERR_PTR(-BCH_ERR_erofs_no_writes);
- goto err;
- }
-
- list_for_each_entry(h, &c->ec_stripe_head_list, list)
- if (h->disk_label == disk_label &&
- h->algo == algo &&
- h->redundancy == redundancy &&
- h->watermark == watermark) {
- ret = bch2_trans_mutex_lock(trans, &h->lock);
- if (ret) {
- h = ERR_PTR(ret);
- goto err;
- }
- goto found;
- }
-
- h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
- if (!h) {
- h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
- goto err;
- }
-found:
- if (h->rw_devs_change_count != c->rw_devs_change_count)
- ec_stripe_head_devs_update(c, h);
-
- if (h->insufficient_devs) {
- mutex_unlock(&h->lock);
- h = NULL;
- }
-err:
- mutex_unlock(&c->ec_stripe_head_lock);
- return h;
-}
-
-static int new_stripe_alloc_buckets(struct btree_trans *trans,
- struct alloc_request *req,
- struct ec_stripe_head *h, struct ec_stripe_new *s,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- struct open_bucket *ob;
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
- int ret = 0;
-
- req->scratch_data_type = req->data_type;
- req->scratch_ptrs = req->ptrs;
- req->scratch_nr_replicas = req->nr_replicas;
- req->scratch_nr_effective = req->nr_effective;
- req->scratch_have_cache = req->have_cache;
- req->scratch_devs_may_alloc = req->devs_may_alloc;
-
- req->devs_may_alloc = h->devs;
- req->have_cache = true;
-
- BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
- BUG_ON(v->nr_redundant != s->nr_parity);
-
-	/* We bypass the sector allocator, which normally does this: */
- bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d,
- c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
-
- for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
- /*
- * Note: we don't yet repair invalid blocks (failed/removed
- * devices) when reusing stripes - we still need a codepath to
- * walk backpointers and update all extents that point to that
- * block when updating the stripe
- */
- if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
- __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d);
-
- if (i < s->nr_data)
- nr_have_data++;
- else
- nr_have_parity++;
- }
-
- BUG_ON(nr_have_data > s->nr_data);
- BUG_ON(nr_have_parity > s->nr_parity);
-
- req->ptrs.nr = 0;
- if (nr_have_parity < s->nr_parity) {
- req->nr_replicas = s->nr_parity;
- req->nr_effective = nr_have_parity;
- req->data_type = BCH_DATA_parity;
-
- ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
-
- open_bucket_for_each(c, &req->ptrs, ob, i) {
- j = find_next_zero_bit(s->blocks_gotten,
- s->nr_data + s->nr_parity,
- s->nr_data);
- BUG_ON(j >= s->nr_data + s->nr_parity);
-
- s->blocks[j] = req->ptrs.v[i];
- v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, s->blocks_gotten);
- }
-
- if (ret)
- goto err;
- }
-
- req->ptrs.nr = 0;
- if (nr_have_data < s->nr_data) {
- req->nr_replicas = s->nr_data;
- req->nr_effective = nr_have_data;
- req->data_type = BCH_DATA_user;
-
- ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
-
- open_bucket_for_each(c, &req->ptrs, ob, i) {
- j = find_next_zero_bit(s->blocks_gotten,
- s->nr_data, 0);
- BUG_ON(j >= s->nr_data);
-
- s->blocks[j] = req->ptrs.v[i];
- v->ptrs[j] = bch2_ob_ptr(c, ob);
- __set_bit(j, s->blocks_gotten);
- }
-
- if (ret)
- goto err;
- }
-err:
- req->data_type = req->scratch_data_type;
- req->ptrs = req->scratch_ptrs;
- req->nr_replicas = req->scratch_nr_replicas;
- req->nr_effective = req->scratch_nr_effective;
- req->have_cache = req->scratch_have_cache;
- req->devs_may_alloc = req->scratch_devs_may_alloc;
- return ret;
-}
-
-static int __get_existing_stripe(struct btree_trans *trans,
- struct ec_stripe_head *head,
- struct ec_stripe_buf *stripe,
- u64 idx)
-{
- struct bch_fs *c = trans->c;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_stripes, POS(0, idx), 0);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- /* We expect write buffer races here */
- if (k.k->type != KEY_TYPE_stripe)
- goto out;
-
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- if (stripe_lru_pos(s.v) <= 1)
- goto out;
-
- if (s.v->disk_label == head->disk_label &&
- s.v->algorithm == head->algo &&
- s.v->nr_redundant == head->redundancy &&
- le16_to_cpu(s.v->sectors) == head->blocksize &&
- bch2_try_open_stripe(c, head->s, idx)) {
- bkey_reassemble(&stripe->key, k);
- ret = 1;
- }
-out:
- bch2_set_btree_iter_dontneed(trans, &iter);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
-{
- struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
- unsigned i;
-
- BUG_ON(existing_v->nr_redundant != s->nr_parity);
- s->nr_data = existing_v->nr_blocks -
- existing_v->nr_redundant;
-
- int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
- if (ret) {
- bch2_stripe_close(c, s);
- return ret;
- }
-
- BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
-
- /*
- * Free buckets we initially allocated - they might conflict with
- * blocks from the stripe we're reusing:
- */
- for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
- bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
- s->blocks[i] = 0;
- }
- memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
- memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
-
- for (unsigned i = 0; i < existing_v->nr_blocks; i++) {
- if (stripe_blockcount_get(existing_v, i)) {
- __set_bit(i, s->blocks_gotten);
- __set_bit(i, s->blocks_allocated);
- }
-
- ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
- }
-
- bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
- s->have_existing_stripe = true;
-
- return 0;
-}
-
-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h,
- struct ec_stripe_new *s)
-{
- struct bch_fs *c = trans->c;
-
- /*
- * If we can't allocate a new stripe, and there's no stripes with empty
- * blocks for us to reuse, that means we have to wait on copygc:
- */
- if (may_create_new_stripe(c))
- return -1;
-
- struct btree_iter lru_iter;
- struct bkey_s_c lru_k;
- int ret = 0;
-
- for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
- lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
- 0, lru_k, ret) {
- ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &lru_iter);
- if (!ret)
- ret = bch_err_throw(c, stripe_alloc_blocked);
- if (ret == 1)
- ret = 0;
- if (ret)
- return ret;
-
- return init_new_stripe_from_existing(c, s);
-}
-
-static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h,
- struct ec_stripe_new *s)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bpos min_pos = POS(0, 1);
- struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
- int ret;
-
- if (!s->res.sectors) {
- ret = bch2_disk_reservation_get(c, &s->res,
- h->blocksize,
- s->nr_parity,
- BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- return ret;
- }
-
- /*
- * Allocate stripe slot
- * XXX: we're going to need a bitrange btree of free stripes
- */
- for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
- BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
- if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
- if (start_pos.offset) {
- start_pos = min_pos;
- bch2_btree_iter_set_pos(trans, &iter, start_pos);
- continue;
- }
-
- ret = bch_err_throw(c, ENOSPC_stripe_create);
- break;
- }
-
- if (bkey_deleted(k.k) &&
- bch2_try_open_stripe(c, s, k.k->p.offset))
- break;
- }
-
- c->ec_stripe_hint = iter.pos.offset;
-
- if (ret)
- goto err;
-
- ret = ec_stripe_mem_alloc(trans, &iter);
- if (ret) {
- bch2_stripe_close(c, s);
- goto err;
- }
-
- s->new_stripe.key.k.p = iter.pos;
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-err:
- bch2_disk_reservation_put(c, &s->res);
- goto out;
-}
-
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
- struct alloc_request *req,
- unsigned algo,
- struct closure *cl)
-{
- struct bch_fs *c = trans->c;
- unsigned redundancy = req->nr_replicas - 1;
- unsigned disk_label = 0;
- struct target t = target_decode(req->target);
- bool waiting = false;
- int ret;
-
- if (t.type == TARGET_GROUP) {
- if (t.group > U8_MAX) {
- bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
- return NULL;
- }
- disk_label = t.group + 1; /* 0 == no label */
- }
-
- struct ec_stripe_head *h =
- __bch2_ec_stripe_head_get(trans, disk_label, algo,
- redundancy, req->watermark);
- if (IS_ERR_OR_NULL(h))
- return h;
-
- if (!h->s) {
- h->s = ec_new_stripe_alloc(c, h);
- if (!h->s) {
- ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc);
- bch_err(c, "failed to allocate new stripe");
- goto err;
- }
-
- h->nr_created++;
- }
-
- struct ec_stripe_new *s = h->s;
-
- if (s->allocated)
- goto allocated;
-
- if (s->have_existing_stripe)
- goto alloc_existing;
-
- /* First, try to allocate a full stripe: */
- enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
- swap(req->watermark, saved_watermark);
- ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h, s);
- swap(req->watermark, saved_watermark);
-
- if (!ret)
- goto allocate_buf;
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, ENOMEM))
- goto err;
-
- /*
- * Not enough buckets available for a full stripe: we must reuse an
- * existing stripe:
- */
- while (1) {
- ret = __bch2_ec_stripe_head_reuse(trans, h, s);
- if (!ret)
- break;
- if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
- goto err;
-
- if (req->watermark == BCH_WATERMARK_copygc) {
- ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
- __bch2_ec_stripe_head_reserve(trans, h, s);
- if (ret)
- goto err;
- goto allocate_buf;
- }
-
- /* XXX freelist_wait? */
- closure_wait(&c->freelist_wait, cl);
- waiting = true;
- }
-
- if (waiting)
- closure_wake_up(&c->freelist_wait);
-alloc_existing:
- /*
- * Retry allocating buckets, with the watermark for this
- * particular write:
- */
- ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
- if (ret)
- goto err;
-
-allocate_buf:
- ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize);
- if (ret)
- goto err;
-
- s->allocated = true;
-allocated:
- BUG_ON(!s->idx);
- BUG_ON(!s->new_stripe.data[0]);
- BUG_ON(trans->restarted);
- return h;
-err:
- bch2_ec_stripe_head_put(c, h);
- return ERR_PTR(ret);
-}
-
-/* device removal */
-
-int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- unsigned dev_idx,
- unsigned flags)
-{
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
-
- struct bch_fs *c = trans->c;
- struct bkey_i_stripe *s =
- bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe);
- int ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- struct disk_accounting_pos acc;
-
- s64 sectors = 0;
- for (unsigned i = 0; i < s->v.nr_blocks; i++)
- sectors -= stripe_blockcount_get(&s->v, i);
-
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = BCH_DATA_user;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
- if (ret)
- return ret;
-
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
-
- /* XXX: how much redundancy do we still have? check degraded flags */
-
- unsigned nr_good = 0;
-
- scoped_guard(rcu)
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr->dev == dev_idx)
- ptr->dev = BCH_SB_MEMBER_INVALID;
-
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
- }
-
- if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
- return bch_err_throw(c, remove_would_lose_data);
-
- unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant;
-
- if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
- return bch_err_throw(c, remove_would_lose_data);
-
- sectors = -sectors;
-
- memset(&acc, 0, sizeof(acc));
- acc.type = BCH_DISK_ACCOUNTING_replicas;
- bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
- acc.replicas.data_type = BCH_DATA_user;
- return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
-}
-
-static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a,
- unsigned flags)
-{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
-
- if (!a->stripe)
- return 0;
-
- if (a->stripe_sectors) {
- struct bch_fs *c = trans->c;
- bch_err(c, "trying to invalidate device in stripe when bucket has stripe data");
- return bch_err_throw(c, invalidate_stripe_to_dev);
- }
-
- struct btree_iter iter;
- struct bkey_s_c_stripe s =
- bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
- BTREE_ITER_slots, stripe);
- int ret = bkey_err(s);
- if (ret)
- return ret;
-
- ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_max_commit(trans, iter,
- BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
- BTREE_ITER_intent, k,
- NULL, NULL, 0, ({
- bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags);
- })));
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* startup/shutdown */
-
-static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
-{
- struct ec_stripe_head *h;
- struct open_bucket *ob;
- unsigned i;
-
- mutex_lock(&c->ec_stripe_head_lock);
- list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- mutex_lock(&h->lock);
- if (!h->s)
- goto unlock;
-
- if (!ca)
- goto found;
-
- for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
- if (!h->s->blocks[i])
- continue;
-
- ob = c->open_buckets + h->s->blocks[i];
- if (ob->dev == ca->dev_idx)
- goto found;
- }
- goto unlock;
-found:
- ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
-unlock:
- mutex_unlock(&h->lock);
- }
- mutex_unlock(&c->ec_stripe_head_lock);
-}
-
-void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
-{
- __bch2_ec_stop(c, ca);
-}
-
-void bch2_fs_ec_stop(struct bch_fs *c)
-{
- __bch2_ec_stop(c, NULL);
-}
-
-static bool bch2_fs_ec_flush_done(struct bch_fs *c)
-{
- sched_annotate_sleep();
-
- mutex_lock(&c->ec_stripe_new_lock);
- bool ret = list_empty(&c->ec_stripe_new_list);
- mutex_unlock(&c->ec_stripe_new_lock);
-
- return ret;
-}
-
-void bch2_fs_ec_flush(struct bch_fs *c)
-{
- wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
-}
-
-int bch2_stripes_read(struct bch_fs *c)
-{
- return 0;
-}
-
-static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
- struct ec_stripe_new *s)
-{
- prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
- s->idx, s->nr_data, s->nr_parity,
- bitmap_weight(s->blocks_allocated, s->nr_data),
- atomic_read(&s->ref[STRIPE_REF_io]),
- atomic_read(&s->ref[STRIPE_REF_stripe]),
- bch2_watermarks[s->h->watermark]);
-
- struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
- unsigned i;
- for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
- prt_printf(out, " %u", s->blocks[i]);
- prt_newline(out);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
- prt_newline(out);
-}
-
-void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct ec_stripe_head *h;
- struct ec_stripe_new *s;
-
- mutex_lock(&c->ec_stripe_head_lock);
- list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
- h->disk_label, h->algo, h->redundancy,
- bch2_watermarks[h->watermark],
- h->nr_created);
-
- if (h->s)
- bch2_new_stripe_to_text(out, c, h->s);
- }
- mutex_unlock(&c->ec_stripe_head_lock);
-
- prt_printf(out, "in flight:\n");
-
- mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list)
- bch2_new_stripe_to_text(out, c, s);
- mutex_unlock(&c->ec_stripe_new_lock);
-}
-
-void bch2_fs_ec_exit(struct bch_fs *c)
-{
- struct ec_stripe_head *h;
- unsigned i;
-
- while (1) {
- mutex_lock(&c->ec_stripe_head_lock);
- h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list);
- mutex_unlock(&c->ec_stripe_head_lock);
-
- if (!h)
- break;
-
- if (h->s) {
- for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
- BUG_ON(h->s->blocks[i]);
-
- kfree(h->s);
- }
- kfree(h);
- }
-
- BUG_ON(!list_empty(&c->ec_stripe_new_list));
-
- bioset_exit(&c->ec_bioset);
-}
-
-void bch2_fs_ec_init_early(struct bch_fs *c)
-{
- spin_lock_init(&c->ec_stripes_new_lock);
-
- INIT_LIST_HEAD(&c->ec_stripe_head_list);
- mutex_init(&c->ec_stripe_head_lock);
-
- INIT_LIST_HEAD(&c->ec_stripe_new_list);
- mutex_init(&c->ec_stripe_new_lock);
- init_waitqueue_head(&c->ec_stripe_new_wait);
-
- INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
- INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
-}
-
-int bch2_fs_ec_init(struct bch_fs *c)
-{
- return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
- BIOSET_NEED_BVECS);
-}
-
-static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
- struct bkey_s_c k,
- struct bkey_buf *last_flushed)
-{
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
-
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- u64 lru_idx = stripe_lru_pos(s.v);
- if (lru_idx) {
- int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
- k.k->p.offset, lru_idx, k, last_flushed);
- if (ret)
- return ret;
- }
- return 0;
-}
-
-int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
- POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
deleted file mode 100644
index 548048adf0d5..000000000000
--- a/fs/bcachefs/ec.h
+++ /dev/null
@@ -1,309 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_H
-#define _BCACHEFS_EC_H
-
-#include "ec_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-
-int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
- .key_validate = bch2_stripe_validate, \
- .val_to_text = bch2_stripe_to_text, \
- .swab = bch2_ptr_swab, \
- .trigger = bch2_trigger_stripe, \
- .min_val_size = 8, \
-})
-
-static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
-{
- return DIV_ROUND_UP(le16_to_cpu(s->sectors),
- 1 << s->csum_granularity_bits);
-}
-
-static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
- unsigned dev, unsigned csum_idx)
-{
- EBUG_ON(s->csum_type >= BCH_CSUM_NR);
-
- unsigned csum_bytes = bch_crc_bytes[s->csum_type];
-
- return sizeof(struct bch_stripe) +
- sizeof(struct bch_extent_ptr) * s->nr_blocks +
- (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
-}
-
-static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
- unsigned idx)
-{
- return stripe_csum_offset(s, s->nr_blocks, 0) +
- sizeof(u16) * idx;
-}
-
-static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
- unsigned idx)
-{
- return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
-}
-
-static inline void stripe_blockcount_set(struct bch_stripe *s,
- unsigned idx, unsigned v)
-{
- __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
-
- *p = cpu_to_le16(v);
-}
-
-static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
-{
- return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
- sizeof(u64));
-}
-
-static inline void *stripe_csum(struct bch_stripe *s,
- unsigned block, unsigned csum_idx)
-{
- EBUG_ON(block >= s->nr_blocks);
- EBUG_ON(csum_idx >= stripe_csums_per_device(s));
-
- return (void *) s + stripe_csum_offset(s, block, csum_idx);
-}
-
-static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
- unsigned block, unsigned csum_idx)
-{
- struct bch_csum csum = { 0 };
-
- memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
- return csum;
-}
-
-static inline void stripe_csum_set(struct bch_stripe *s,
- unsigned block, unsigned csum_idx,
- struct bch_csum csum)
-{
- memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
-}
-
-#define STRIPE_LRU_POS_EMPTY 1
-
-static inline u64 stripe_lru_pos(const struct bch_stripe *s)
-{
- if (!s)
- return 0;
-
- unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
-
- for (unsigned i = 0; i < nr_data; i++)
- blocks_empty += !stripe_blockcount_get(s, i);
-
- /* Will be picked up by the stripe_delete worker */
- if (blocks_empty == nr_data)
- return STRIPE_LRU_POS_EMPTY;
-
- if (!blocks_empty)
- return 0;
-
- /* invert: more blocks empty = reuse first */
- return LRU_TIME_MAX - blocks_empty;
-}
-
-static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
- const struct bch_extent_ptr *data_ptr,
- unsigned sectors)
-{
- return (data_ptr->dev == stripe_ptr->dev ||
- data_ptr->dev == BCH_SB_MEMBER_INVALID ||
- stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
- data_ptr->gen == stripe_ptr->gen &&
- data_ptr->offset >= stripe_ptr->offset &&
- data_ptr->offset < stripe_ptr->offset + sectors;
-}
-
-static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
- struct extent_ptr_decoded p)
-{
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
-
- BUG_ON(!p.has_ec);
-
- if (p.ec.block >= nr_data)
- return false;
-
- return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
- le16_to_cpu(s->sectors));
-}
-
-static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
- struct extent_ptr_decoded p)
-{
- unsigned nr_data = m->nr_blocks - m->nr_redundant;
-
- BUG_ON(!p.has_ec);
-
- if (p.ec.block >= nr_data)
- return false;
-
- return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
- m->sectors);
-}
-
-static inline void gc_stripe_unlock(struct gc_stripe *s)
-{
- BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
-
- clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock);
- smp_mb__after_atomic();
- wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);
-}
-
-static inline void gc_stripe_lock(struct gc_stripe *s)
-{
- wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR,
- TASK_UNINTERRUPTIBLE);
-}
-
-struct bch_read_bio;
-
-struct ec_stripe_buf {
- /* might not be buffering the entire stripe: */
- unsigned offset;
- unsigned size;
- unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
-
- void *data[BCH_BKEY_PTRS_MAX];
-
- __BKEY_PADDED(key, 255);
-};
-
-struct ec_stripe_head;
-
-enum ec_stripe_ref {
- STRIPE_REF_io,
- STRIPE_REF_stripe,
- STRIPE_REF_NR
-};
-
-struct ec_stripe_new {
- struct bch_fs *c;
- struct ec_stripe_head *h;
- struct mutex lock;
- struct list_head list;
-
- struct hlist_node hash;
- u64 idx;
-
- struct closure iodone;
-
- atomic_t ref[STRIPE_REF_NR];
-
- int err;
-
- u8 nr_data;
- u8 nr_parity;
- bool allocated;
- bool pending;
- bool have_existing_stripe;
-
- unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
- unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
- open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
- struct disk_reservation res;
-
- struct ec_stripe_buf new_stripe;
- struct ec_stripe_buf existing_stripe;
-};
-
-struct ec_stripe_head {
- struct list_head list;
- struct mutex lock;
-
- unsigned disk_label;
- unsigned algo;
- unsigned redundancy;
- enum bch_watermark watermark;
- bool insufficient_devs;
-
- unsigned long rw_devs_change_count;
-
- u64 nr_created;
-
- struct bch_devs_mask devs;
- unsigned nr_active_devs;
-
- unsigned blocksize;
-
- struct dev_stripe_state block_stripe;
- struct dev_stripe_state parity_stripe;
-
- struct ec_stripe_new *s;
-};
-
-int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
-
-void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-
-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
-
-int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
-
-void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-
-struct alloc_request;
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
- struct alloc_request *, unsigned, struct closure *);
-
-void bch2_do_stripe_deletes(struct bch_fs *);
-void bch2_ec_do_stripe_creates(struct bch_fs *);
-void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
-
-static inline void ec_stripe_new_get(struct ec_stripe_new *s,
- enum ec_stripe_ref ref)
-{
- atomic_inc(&s->ref[ref]);
-}
-
-static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
- enum ec_stripe_ref ref)
-{
- BUG_ON(atomic_read(&s->ref[ref]) <= 0);
-
- if (atomic_dec_and_test(&s->ref[ref]))
- switch (ref) {
- case STRIPE_REF_stripe:
- bch2_ec_stripe_new_free(c, s);
- break;
- case STRIPE_REF_io:
- bch2_ec_do_stripe_creates(c);
- break;
- default:
- BUG();
- }
-}
-
-int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *,
- struct bkey_s_c, unsigned, unsigned);
-int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned);
-
-void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
-void bch2_fs_ec_stop(struct bch_fs *);
-void bch2_fs_ec_flush(struct bch_fs *);
-
-int bch2_stripes_read(struct bch_fs *);
-
-void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_ec_exit(struct bch_fs *);
-void bch2_fs_ec_init_early(struct bch_fs *);
-int bch2_fs_ec_init(struct bch_fs *);
-
-int bch2_check_stripe_to_lru_refs(struct bch_fs *);
-
-#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
deleted file mode 100644
index b9770f24f213..000000000000
--- a/fs/bcachefs/ec_format.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_FORMAT_H
-#define _BCACHEFS_EC_FORMAT_H
-
-struct bch_stripe {
- struct bch_val v;
- __le16 sectors;
- __u8 algorithm;
- __u8 nr_blocks;
- __u8 nr_redundant;
-
- __u8 csum_granularity_bits;
- __u8 csum_type;
-
- /*
- * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
- *
- * we can manage with this because this only needs to point to a
-	 * we can manage with this because it only needs to point to a
-	 * disk label, not a target:
- __u8 disk_label;
-
- /*
- * Variable length sections:
- * - Pointers
- * - Checksums
- * 2D array of [stripe block/device][csum block], with checksum block
- * size given by csum_granularity_bits
- * - Block sector counts: per-block array of u16s
- *
- * XXX:
- * Either checksums should have come last, or we should have included a
- * checksum_size field (the size in bytes of the checksum itself, not
- * the blocksize the checksum covers).
- *
- * Currently we aren't able to access the block sector counts if the
- * checksum type is unknown.
- */
-
- struct bch_extent_ptr ptrs[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
deleted file mode 100644
index 809446c78951..000000000000
--- a/fs/bcachefs/ec_types.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EC_TYPES_H
-#define _BCACHEFS_EC_TYPES_H
-
-#include "bcachefs_format.h"
-
-union bch_replicas_padded {
- u8 bytes[struct_size_t(struct bch_replicas_entry_v1,
- devs, BCH_BKEY_PTRS_MAX)];
- struct bch_replicas_entry_v1 e;
-};
-
-struct stripe {
- size_t heap_idx;
- u16 sectors;
- u8 algorithm;
- u8 nr_blocks;
- u8 nr_redundant;
- u8 blocks_nonempty;
- u8 disk_label;
-};
-
-struct gc_stripe {
- u8 lock;
- unsigned alive:1; /* does a corresponding key exist in stripes btree? */
- u16 sectors;
- u8 nr_blocks;
- u8 nr_redundant;
- u16 block_sectors[BCH_BKEY_PTRS_MAX];
- struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
-
- union bch_replicas_padded r;
-};
-
-#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c
deleted file mode 100644
index 56ab430f209f..000000000000
--- a/fs/bcachefs/enumerated_ref.c
+++ /dev/null
@@ -1,144 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "enumerated_ref.h"
-#include "util.h"
-
-#include <linux/completion.h>
-
-#ifdef ENUMERATED_REF_DEBUG
-void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- atomic_long_inc(&ref->refs[idx]);
-}
-
-bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- return atomic_long_inc_not_zero(&ref->refs[idx]);
-}
-
-bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- return !ref->dying &&
- atomic_long_inc_not_zero(&ref->refs[idx]);
-}
-
-void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
-{
- BUG_ON(idx >= ref->nr);
- long v = atomic_long_dec_return(&ref->refs[idx]);
-
- BUG_ON(v < 0);
- if (v)
- return;
-
- for (unsigned i = 0; i < ref->nr; i++)
- if (atomic_long_read(&ref->refs[i]))
- return;
-
- if (ref->stop_fn)
- ref->stop_fn(ref);
- complete(&ref->stop_complete);
-}
-#endif
-
-#ifndef ENUMERATED_REF_DEBUG
-static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref)
-{
- struct enumerated_ref *ref =
- container_of(percpu_ref, struct enumerated_ref, ref);
-
- if (ref->stop_fn)
- ref->stop_fn(ref);
- complete(&ref->stop_complete);
-}
-#endif
-
-void enumerated_ref_stop_async(struct enumerated_ref *ref)
-{
- reinit_completion(&ref->stop_complete);
-
-#ifndef ENUMERATED_REF_DEBUG
- percpu_ref_kill(&ref->ref);
-#else
- ref->dying = true;
- for (unsigned i = 0; i < ref->nr; i++)
- enumerated_ref_put(ref, i);
-#endif
-}
-
-void enumerated_ref_stop(struct enumerated_ref *ref,
- const char * const names[])
-{
- enumerated_ref_stop_async(ref);
- while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) {
- struct printbuf buf = PRINTBUF;
-
-		prt_str(&buf, "Waited for 10 seconds to shut down enumerated ref\n");
- prt_str(&buf, "Outstanding refs:\n");
- enumerated_ref_to_text(&buf, ref, names);
- printk(KERN_ERR "%s", buf.buf);
- printbuf_exit(&buf);
- }
-}
-
-void enumerated_ref_start(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
- percpu_ref_reinit(&ref->ref);
-#else
- ref->dying = false;
- for (unsigned i = 0; i < ref->nr; i++) {
- BUG_ON(atomic_long_read(&ref->refs[i]));
- atomic_long_inc(&ref->refs[i]);
- }
-#endif
-}
-
-void enumerated_ref_exit(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
- percpu_ref_exit(&ref->ref);
-#else
- kfree(ref->refs);
- ref->refs = NULL;
- ref->nr = 0;
-#endif
-}
-
-int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr,
- void (*stop_fn)(struct enumerated_ref *))
-{
- init_completion(&ref->stop_complete);
- ref->stop_fn = stop_fn;
-
-#ifndef ENUMERATED_REF_DEBUG
- return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL);
-#else
- ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL);
- if (!ref->refs)
- return -ENOMEM;
-
- ref->nr = nr;
- return 0;
-#endif
-}
-
-void enumerated_ref_to_text(struct printbuf *out,
- struct enumerated_ref *ref,
- const char * const names[])
-{
-#ifdef ENUMERATED_REF_DEBUG
- bch2_printbuf_tabstop_push(out, 32);
-
- for (unsigned i = 0; i < ref->nr; i++)
- prt_printf(out, "%s\t%li\n", names[i],
- atomic_long_read(&ref->refs[i]));
-#else
- prt_str(out, "(not in debug mode)\n");
-#endif
-}
diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h
deleted file mode 100644
index ec01cf59ef80..000000000000
--- a/fs/bcachefs/enumerated_ref.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ENUMERATED_REF_H
-#define _BCACHEFS_ENUMERATED_REF_H
-
-#include "enumerated_ref_types.h"
-
-/*
- * A refcount where the users are enumerated: in debug mode, we create separate
- * refcounts for each user, to make leaks and refcount errors easy to track
- * down:
- */
-
-#ifdef ENUMERATED_REF_DEBUG
-void enumerated_ref_get(struct enumerated_ref *, unsigned);
-bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned);
-bool enumerated_ref_tryget(struct enumerated_ref *, unsigned);
-void enumerated_ref_put(struct enumerated_ref *, unsigned);
-#else
-
-static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
-{
- percpu_ref_get(&ref->ref);
-}
-
-static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- return percpu_ref_tryget(&ref->ref);
-}
-
-static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
-{
- return percpu_ref_tryget_live(&ref->ref);
-}
-
-static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
-{
- percpu_ref_put(&ref->ref);
-}
-#endif
-
-static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref)
-{
-#ifndef ENUMERATED_REF_DEBUG
- return percpu_ref_is_zero(&ref->ref);
-#else
- for (unsigned i = 0; i < ref->nr; i++)
- if (atomic_long_read(&ref->refs[i]))
- return false;
- return true;
-#endif
-}
-
-void enumerated_ref_stop_async(struct enumerated_ref *);
-void enumerated_ref_stop(struct enumerated_ref *, const char * const[]);
-void enumerated_ref_start(struct enumerated_ref *);
-
-void enumerated_ref_exit(struct enumerated_ref *);
-int enumerated_ref_init(struct enumerated_ref *, unsigned,
- void (*stop_fn)(struct enumerated_ref *));
-
-struct printbuf;
-void enumerated_ref_to_text(struct printbuf *,
- struct enumerated_ref *,
- const char * const[]);
-
-#endif /* _BCACHEFS_ENUMERATED_REF_H */
diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h
deleted file mode 100644
index 0e6076f466d3..000000000000
--- a/fs/bcachefs/enumerated_ref_types.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H
-#define _BCACHEFS_ENUMERATED_REF_TYPES_H
-
-#include <linux/percpu-refcount.h>
-
-struct enumerated_ref {
-#ifdef ENUMERATED_REF_DEBUG
- unsigned nr;
- bool dying;
- atomic_long_t *refs;
-#else
- struct percpu_ref ref;
-#endif
- void (*stop_fn)(struct enumerated_ref *);
- struct completion stop_complete;
-};
-
-#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
deleted file mode 100644
index c39cf304c681..000000000000
--- a/fs/bcachefs/errcode.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "errcode.h"
-#include "trace.h"
-
-#include <linux/errname.h>
-
-static const char * const bch2_errcode_strs[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
- BCH_ERRCODES()
-#undef x
- NULL
-};
-
-static const unsigned bch2_errcode_parents[] = {
-#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
- BCH_ERRCODES()
-#undef x
-};
-
-__attribute__((const))
-const char *bch2_err_str(int err)
-{
- const char *errstr;
-
- err = abs(err);
-
- BUG_ON(err >= BCH_ERR_MAX);
-
- if (err >= BCH_ERR_START)
- errstr = bch2_errcode_strs[err - BCH_ERR_START];
- else if (err)
- errstr = errname(err);
- else
- errstr = "(No error)";
- return errstr ?: "(Invalid error)";
-}
-
-__attribute__((const))
-bool __bch2_err_matches(int err, int class)
-{
- err = abs(err);
- class = abs(class);
-
- BUG_ON(err >= BCH_ERR_MAX);
- BUG_ON(class >= BCH_ERR_MAX);
-
- while (err >= BCH_ERR_START && err != class)
- err = bch2_errcode_parents[err - BCH_ERR_START];
-
- return err == class;
-}
-
-int __bch2_err_class(int bch_err)
-{
- int std_err = -bch_err;
- BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
-
- while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
- std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
-
- trace_error_downcast(bch_err, std_err, _RET_IP_);
-
- return -std_err;
-}
-
-const char *bch2_blk_status_to_str(blk_status_t status)
-{
- if (status == BLK_STS_REMOVED)
- return "device removed";
- return blk_status_to_str(status);
-}
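errcode.c derives both lookup tables from a single x-macro list: one maps each private error code to its name, the other to its parent class, and __bch2_err_matches()/__bch2_err_class() simply walk that parent chain. Below is a standalone sketch of the same technique with a three-entry toy list; the codes, names and base value are invented, the real list is BCH_ERRCODES() in errcode.h.

/*
 * Standalone sketch of the x-macro error-hierarchy technique -- not the
 * real bcachefs list. Private codes live above the errno range; each one
 * names its parent, which may be another private code or a plain errno.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_ERRCODES()                                         \
        x(EIO,                          demo_read_failed)       \
        x(DEMO_ERR_demo_read_failed,    demo_read_csum_bad)     \
        x(DEMO_ERR_demo_read_failed,    demo_read_io_timeout)

enum demo_errcode {
        DEMO_ERR_START = 2048,
#define x(class, err) DEMO_ERR_##err,
        DEMO_ERRCODES()
#undef x
        DEMO_ERR_MAX
};

static const char * const demo_errcode_strs[] = {
#define x(class, err) [DEMO_ERR_##err - DEMO_ERR_START] = #err,
        DEMO_ERRCODES()
#undef x
        NULL
};

static const unsigned demo_errcode_parents[] = {
#define x(class, err) [DEMO_ERR_##err - DEMO_ERR_START] = class,
        DEMO_ERRCODES()
#undef x
};

/* Walk up the parent chain until we hit @class or leave the private range. */
static bool demo_err_matches(int err, int class)
{
        err     = abs(err);
        class   = abs(class);

        while (err >= DEMO_ERR_START && err != class)
                err = demo_errcode_parents[err - DEMO_ERR_START];

        return err == class;
}

int main(void)
{
        int err = -DEMO_ERR_demo_read_csum_bad;

        /* A specific code matches both its parent and its errno root: */
        printf("%s matches EIO: %d\n",
               demo_errcode_strs[abs(err) - DEMO_ERR_START],
               demo_err_matches(err, EIO));
        return 0;
}

Because the walk only ever moves toward the errno roots, the same loop answers "is this a checksum error?" and "should this surface to userspace as EIO?", which is exactly how __bch2_err_class() downcasts private codes at the filesystem boundary.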
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
deleted file mode 100644
index acc3b7b67704..000000000000
--- a/fs/bcachefs/errcode.h
+++ /dev/null
@@ -1,387 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERRCODE_H
-#define _BCACHEFS_ERRCODE_H
-
-#define BCH_ERRCODES() \
- x(ERANGE, ERANGE_option_too_small) \
- x(ERANGE, ERANGE_option_too_big) \
- x(EINVAL, injected) \
- x(BCH_ERR_injected, injected_fs_start) \
- x(EINVAL, mount_option) \
- x(BCH_ERR_mount_option, option_name) \
- x(BCH_ERR_mount_option, option_value) \
- x(BCH_ERR_mount_option, option_not_bool) \
- x(ENOMEM, ENOMEM_stripe_buf) \
- x(ENOMEM, ENOMEM_replicas_table) \
- x(ENOMEM, ENOMEM_cpu_replicas) \
- x(ENOMEM, ENOMEM_replicas_gc) \
- x(ENOMEM, ENOMEM_disk_groups_validate) \
- x(ENOMEM, ENOMEM_disk_groups_to_cpu) \
- x(ENOMEM, ENOMEM_mark_snapshot) \
- x(ENOMEM, ENOMEM_mark_stripe) \
- x(ENOMEM, ENOMEM_mark_stripe_ptr) \
- x(ENOMEM, ENOMEM_btree_key_cache_create) \
- x(ENOMEM, ENOMEM_btree_key_cache_fill) \
- x(ENOMEM, ENOMEM_btree_key_cache_insert) \
- x(ENOMEM, ENOMEM_trans_kmalloc) \
- x(ENOMEM, ENOMEM_trans_log_msg) \
- x(ENOMEM, ENOMEM_do_encrypt) \
- x(ENOMEM, ENOMEM_ec_read_extent) \
- x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \
- x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \
- x(ENOMEM, ENOMEM_fs_btree_cache_init) \
- x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \
- x(ENOMEM, ENOMEM_fs_counters_init) \
- x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \
- x(ENOMEM, ENOMEM_io_clock_init) \
- x(ENOMEM, ENOMEM_blacklist_table_init) \
- x(ENOMEM, ENOMEM_sb_realloc_injected) \
- x(ENOMEM, ENOMEM_sb_bio_realloc) \
- x(ENOMEM, ENOMEM_sb_buf_realloc) \
- x(ENOMEM, ENOMEM_sb_journal_validate) \
- x(ENOMEM, ENOMEM_sb_journal_v2_validate) \
- x(ENOMEM, ENOMEM_journal_entry_add) \
- x(ENOMEM, ENOMEM_journal_read_buf_realloc) \
- x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\
- x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \
- x(ENOMEM, ENOMEM_bio_read_init) \
- x(ENOMEM, ENOMEM_bio_read_split_init) \
- x(ENOMEM, ENOMEM_bio_write_init) \
- x(ENOMEM, ENOMEM_bio_bounce_pages_init) \
- x(ENOMEM, ENOMEM_writepage_bioset_init) \
- x(ENOMEM, ENOMEM_dio_read_bioset_init) \
- x(ENOMEM, ENOMEM_dio_write_bioset_init) \
- x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
- x(ENOMEM, ENOMEM_promote_table_init) \
- x(ENOMEM, ENOMEM_async_obj_init) \
- x(ENOMEM, ENOMEM_compression_bounce_read_init) \
- x(ENOMEM, ENOMEM_compression_bounce_write_init) \
- x(ENOMEM, ENOMEM_compression_workspace_init) \
- x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \
- x(EIO, compression_workspace_not_initialized) \
- x(ENOMEM, ENOMEM_bucket_gens) \
- x(ENOMEM, ENOMEM_buckets_nouse) \
- x(ENOMEM, ENOMEM_usage_init) \
- x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \
- x(ENOMEM, ENOMEM_btree_node_reclaim) \
- x(ENOMEM, ENOMEM_btree_node_mem_alloc) \
- x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \
- x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\
- x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \
- x(ENOMEM, ENOMEM_set_nr_journal_buckets) \
- x(ENOMEM, ENOMEM_dev_journal_init) \
- x(ENOMEM, ENOMEM_journal_pin_fifo) \
- x(ENOMEM, ENOMEM_journal_buf) \
- x(ENOMEM, ENOMEM_gc_start) \
- x(ENOMEM, ENOMEM_gc_alloc_start) \
- x(ENOMEM, ENOMEM_gc_reflink_start) \
- x(ENOMEM, ENOMEM_gc_gens) \
- x(ENOMEM, ENOMEM_gc_repair_key) \
- x(ENOMEM, ENOMEM_fsck_extent_ends_at) \
- x(ENOMEM, ENOMEM_fsck_add_nlink) \
- x(ENOMEM, ENOMEM_journal_key_insert) \
- x(ENOMEM, ENOMEM_journal_keys_sort) \
- x(ENOMEM, ENOMEM_read_superblock_clean) \
- x(ENOMEM, ENOMEM_fs_alloc) \
- x(ENOMEM, ENOMEM_fs_name_alloc) \
- x(ENOMEM, ENOMEM_fs_other_alloc) \
- x(ENOMEM, ENOMEM_dev_alloc) \
- x(ENOMEM, ENOMEM_disk_accounting) \
- x(ENOMEM, ENOMEM_stripe_head_alloc) \
- x(ENOMEM, ENOMEM_journal_read_bucket) \
- x(ENOSPC, ENOSPC_disk_reservation) \
- x(ENOSPC, ENOSPC_bucket_alloc) \
- x(ENOSPC, ENOSPC_disk_label_add) \
- x(ENOSPC, ENOSPC_stripe_create) \
- x(ENOSPC, ENOSPC_inode_create) \
- x(ENOSPC, ENOSPC_str_hash_create) \
- x(ENOSPC, ENOSPC_snapshot_create) \
- x(ENOSPC, ENOSPC_subvolume_create) \
- x(ENOSPC, ENOSPC_sb) \
- x(ENOSPC, ENOSPC_sb_journal) \
- x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \
- x(ENOSPC, ENOSPC_sb_quota) \
- x(ENOSPC, ENOSPC_sb_replicas) \
- x(ENOSPC, ENOSPC_sb_members) \
- x(ENOSPC, ENOSPC_sb_members_v2) \
- x(ENOSPC, ENOSPC_sb_crypt) \
- x(ENOSPC, ENOSPC_sb_downgrade) \
- x(ENOSPC, ENOSPC_btree_slot) \
- x(ENOSPC, ENOSPC_snapshot_tree) \
- x(ENOENT, ENOENT_bkey_type_mismatch) \
- x(ENOENT, ENOENT_str_hash_lookup) \
- x(ENOENT, ENOENT_str_hash_set_must_replace) \
- x(ENOENT, ENOENT_inode) \
- x(ENOENT, ENOENT_not_subvol) \
- x(ENOENT, ENOENT_not_directory) \
- x(ENOENT, ENOENT_directory_dead) \
- x(ENOENT, ENOENT_subvolume) \
- x(ENOENT, ENOENT_snapshot_tree) \
- x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
- x(ENOENT, ENOENT_dev_not_found) \
- x(ENOENT, ENOENT_dev_bucket_not_found) \
- x(ENOENT, ENOENT_dev_idx_not_found) \
- x(ENOENT, ENOENT_inode_no_backpointer) \
- x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
- x(ENOENT, btree_node_dying) \
- x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
- x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
- x(EEXIST, EEXIST_str_hash_set) \
- x(EEXIST, EEXIST_discard_in_flight_add) \
- x(EEXIST, EEXIST_subvolume_create) \
- x(ENOSPC, open_buckets_empty) \
- x(ENOSPC, freelist_empty) \
- x(BCH_ERR_freelist_empty, no_buckets_found) \
- x(0, transaction_restart) \
- x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
- x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
- x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
- x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
- x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\
- x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \
- x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
- x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
- x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
- x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
- x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
- x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
- x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
- x(BCH_ERR_transaction_restart, transaction_restart_nested) \
- x(BCH_ERR_transaction_restart, transaction_restart_commit) \
- x(0, no_btree_node) \
- x(BCH_ERR_no_btree_node, no_btree_node_relock) \
- x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
- x(BCH_ERR_no_btree_node, no_btree_node_drop) \
- x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \
- x(BCH_ERR_no_btree_node, no_btree_node_up) \
- x(BCH_ERR_no_btree_node, no_btree_node_down) \
- x(BCH_ERR_no_btree_node, no_btree_node_init) \
- x(BCH_ERR_no_btree_node, no_btree_node_cached) \
- x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \
- x(0, btree_insert_fail) \
- x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
- x(0, backpointer_to_overwritten_btree_node) \
- x(0, journal_reclaim_would_deadlock) \
- x(EINVAL, fsck) \
- x(BCH_ERR_fsck, fsck_ask) \
- x(BCH_ERR_fsck, fsck_fix) \
- x(BCH_ERR_fsck, fsck_delete_bkey) \
- x(BCH_ERR_fsck, fsck_ignore) \
- x(BCH_ERR_fsck, fsck_errors_not_fixed) \
- x(BCH_ERR_fsck, fsck_repair_unimplemented) \
- x(BCH_ERR_fsck, fsck_repair_impossible) \
- x(EINVAL, recovery_will_run) \
- x(BCH_ERR_recovery_will_run, restart_recovery) \
- x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \
- x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \
- x(0, data_update_done) \
- x(0, bkey_was_deleted) \
- x(BCH_ERR_data_update_done, data_update_done_would_block) \
- x(BCH_ERR_data_update_done, data_update_done_unwritten) \
- x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
- x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \
- x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \
- x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \
- x(EINVAL, device_state_not_allowed) \
- x(EINVAL, member_info_missing) \
- x(EINVAL, mismatched_block_size) \
- x(EINVAL, block_size_too_small) \
- x(EINVAL, bucket_size_too_small) \
- x(EINVAL, device_size_too_small) \
- x(EINVAL, device_size_too_big) \
- x(EINVAL, device_not_a_member_of_filesystem) \
- x(EINVAL, device_has_been_removed) \
- x(EINVAL, device_splitbrain) \
- x(EINVAL, device_already_online) \
- x(EINVAL, filesystem_uuid_already_open) \
- x(EINVAL, insufficient_devices_to_start) \
- x(EINVAL, invalid) \
- x(EINVAL, internal_fsck_err) \
- x(EINVAL, opt_parse_error) \
- x(EINVAL, remove_with_metadata_missing_unimplemented)\
- x(EINVAL, remove_would_lose_data) \
- x(EINVAL, no_resize_with_buckets_nouse) \
- x(EINVAL, inode_unpack_error) \
- x(EINVAL, inode_not_unlinked) \
- x(EINVAL, inode_has_child_snapshot) \
- x(EINVAL, varint_decode_error) \
- x(EINVAL, erasure_coding_found_btree_node) \
- x(EINVAL, option_negative) \
- x(EOPNOTSUPP, may_not_use_incompat_feature) \
- x(EROFS, erofs_trans_commit) \
- x(EROFS, erofs_no_writes) \
- x(EROFS, erofs_journal_err) \
- x(EROFS, erofs_sb_err) \
- x(EROFS, erofs_unfixed_errors) \
- x(EROFS, erofs_norecovery) \
- x(EROFS, erofs_nochanges) \
- x(EROFS, erofs_no_alloc_info) \
- x(EROFS, erofs_filesystem_full) \
- x(EROFS, insufficient_devices) \
- x(0, operation_blocked) \
- x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
- x(BCH_ERR_operation_blocked, journal_res_blocked) \
- x(BCH_ERR_journal_res_blocked, journal_blocked) \
- x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \
- x(BCH_ERR_journal_res_blocked, journal_max_open) \
- x(BCH_ERR_journal_res_blocked, journal_full) \
- x(BCH_ERR_journal_res_blocked, journal_pin_full) \
- x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \
- x(BCH_ERR_journal_res_blocked, journal_stuck) \
- x(BCH_ERR_journal_res_blocked, journal_retry_open) \
- x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \
- x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \
- x(BCH_ERR_invalid, invalid_sb) \
- x(BCH_ERR_invalid_sb, invalid_sb_magic) \
- x(BCH_ERR_invalid_sb, invalid_sb_version) \
- x(BCH_ERR_invalid_sb, invalid_sb_features) \
- x(BCH_ERR_invalid_sb, invalid_sb_too_big) \
- x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \
- x(BCH_ERR_invalid_sb, invalid_sb_csum) \
- x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
- x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
- x(BCH_ERR_invalid_sb, invalid_sb_offset) \
- x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
- x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
- x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
- x(BCH_ERR_invalid_sb, invalid_sb_field_size) \
- x(BCH_ERR_invalid_sb, invalid_sb_layout) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \
- x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \
- x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \
- x(BCH_ERR_invalid_sb, invalid_sb_members) \
- x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
- x(BCH_ERR_invalid_sb, invalid_sb_replicas) \
- x(BCH_ERR_invalid_sb, invalid_replicas_entry) \
- x(BCH_ERR_invalid_sb, invalid_sb_journal) \
- x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \
- x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
- x(BCH_ERR_invalid_sb, invalid_sb_clean) \
- x(BCH_ERR_invalid_sb, invalid_sb_quota) \
- x(BCH_ERR_invalid_sb, invalid_sb_errors) \
- x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
- x(BCH_ERR_invalid_sb, invalid_sb_ext) \
- x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \
- x(BCH_ERR_invalid, invalid_bkey) \
- x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
- x(EROFS, journal_shutdown) \
- x(EIO, journal_flush_err) \
- x(EIO, journal_write_err) \
- x(EIO, btree_node_read_err) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \
- x(EIO, sb_not_downgraded) \
- x(EIO, btree_node_write_all_failed) \
- x(EIO, btree_node_read_error) \
- x(EIO, btree_need_topology_repair) \
- x(EIO, bucket_ref_update) \
- x(EIO, trigger_alloc) \
- x(EIO, trigger_pointer) \
- x(EIO, trigger_stripe_pointer) \
- x(EIO, metadata_bucket_inconsistency) \
- x(EIO, mark_stripe) \
- x(EIO, stripe_reconstruct) \
- x(EIO, key_type_error) \
- x(EIO, extent_poisoned) \
- x(EIO, missing_indirect_extent) \
- x(EIO, invalidate_stripe_to_dev) \
- x(EIO, no_encryption_key) \
- x(EIO, insufficient_journal_devices) \
- x(EIO, device_offline) \
- x(EIO, EIO_fault_injected) \
- x(EIO, ec_block_read) \
- x(EIO, ec_block_write) \
- x(EIO, recompute_checksum) \
- x(EIO, decompress) \
- x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \
- x(BCH_ERR_decompress, decompress_lz4) \
- x(BCH_ERR_decompress, decompress_gzip) \
- x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \
- x(BCH_ERR_decompress, decompress_zstd) \
- x(EIO, data_write) \
- x(BCH_ERR_data_write, data_write_io) \
- x(BCH_ERR_data_write, data_write_csum) \
- x(BCH_ERR_data_write, data_write_invalid_ptr) \
- x(BCH_ERR_data_write, data_write_misaligned) \
- x(BCH_ERR_decompress, data_read) \
- x(BCH_ERR_data_read, no_device_to_read_from) \
- x(BCH_ERR_data_read, no_devices_valid) \
- x(BCH_ERR_data_read, data_read_io_err) \
- x(BCH_ERR_data_read, data_read_csum_err) \
- x(BCH_ERR_data_read, data_read_retry) \
- x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \
- x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \
- x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\
- x(BCH_ERR_data_read, data_read_decompress_err) \
- x(BCH_ERR_data_read, data_read_decrypt_err) \
- x(BCH_ERR_data_read, data_read_ptr_stale_race) \
- x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \
- x(BCH_ERR_data_read, data_read_no_encryption_key) \
- x(BCH_ERR_data_read, data_read_buffer_too_small) \
- x(BCH_ERR_data_read, data_read_key_overwritten) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \
- x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \
- x(0, nopromote) \
- x(BCH_ERR_nopromote, nopromote_may_not) \
- x(BCH_ERR_nopromote, nopromote_already_promoted) \
- x(BCH_ERR_nopromote, nopromote_unwritten) \
- x(BCH_ERR_nopromote, nopromote_congested) \
- x(BCH_ERR_nopromote, nopromote_in_flight) \
- x(BCH_ERR_nopromote, nopromote_no_writes) \
- x(BCH_ERR_nopromote, nopromote_enomem) \
- x(0, invalid_snapshot_node) \
- x(0, option_needs_open_fs) \
- x(0, remove_disk_accounting_entry)
-
-enum bch_errcode {
- BCH_ERR_START = 2048,
-#define x(class, err) BCH_ERR_##err,
- BCH_ERRCODES()
-#undef x
- BCH_ERR_MAX
-};
-
-__attribute__((const)) const char *bch2_err_str(int);
-
-__attribute__((const)) bool __bch2_err_matches(int, int);
-
-__attribute__((const))
-static inline bool _bch2_err_matches(int err, int class)
-{
- return err < 0 && __bch2_err_matches(err, class);
-}
-
-#define bch2_err_matches(_err, _class) \
-({ \
- BUILD_BUG_ON(!__builtin_constant_p(_class)); \
- unlikely(_bch2_err_matches(_err, _class)); \
-})
-
-int __bch2_err_class(int);
-
-static inline long bch2_err_class(long err)
-{
- return err < 0 ? __bch2_err_class(err) : err;
-}
-
-#define BLK_STS_REMOVED ((__force blk_status_t)128)
-
-#include <linux/blk_types.h>
-const char *bch2_blk_status_to_str(blk_status_t);
-
-#endif /* _BCACHEFS_ERRCODE_H */

diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
deleted file mode 100644
index 267e73d9d7e6..000000000000
--- a/fs/bcachefs/error.c
+++ /dev/null
@@ -1,771 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "error.h"
-#include "journal.h"
-#include "namei.h"
-#include "recovery_passes.h"
-#include "super.h"
-#include "thread_with_file.h"
-
-#define FSCK_ERR_RATELIMIT_NR 10
-
-void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out)
-{
- printbuf_indent_add_nextline(out, 2);
-
-#ifdef BCACHEFS_LOG_PREFIX
- prt_printf(out, "bcachefs (%s): ", fs_or_dev_name);
-#endif
-}
-
-bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
-{
- set_bit(BCH_FS_error, &c->flags);
-
- switch (c->opts.errors) {
- case BCH_ON_ERROR_continue:
- return false;
- case BCH_ON_ERROR_fix_safe:
- case BCH_ON_ERROR_ro:
- bch2_fs_emergency_read_only2(c, out);
- return true;
- case BCH_ON_ERROR_panic:
- bch2_print_str(c, KERN_ERR, out->buf);
- panic(bch2_fmt(c, "panic after error"));
- return true;
- default:
- BUG();
- }
-}
-
-bool bch2_inconsistent_error(struct bch_fs *c)
-{
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- printbuf_indent_add_nextline(&buf, 2);
-
- bool ret = __bch2_inconsistent_error(c, &buf);
- if (ret)
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-__printf(3, 0)
-static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans,
- const char *fmt, va_list args)
-{
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- bch2_log_msg_start(c, &buf);
-
- prt_vprintf(&buf, fmt, args);
- prt_newline(&buf);
-
- if (trans)
- bch2_trans_updates_to_text(&buf, trans);
- bool ret = __bch2_inconsistent_error(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
-
- printbuf_exit(&buf);
- return ret;
-}
-
-bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...)
-{
- va_list args;
- va_start(args, fmt);
- bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args);
- va_end(args);
- return ret;
-}
-
-bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...)
-{
- va_list args;
- va_start(args, fmt);
- bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args);
- va_end(args);
- return ret;
-}
-
-int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
-{
- prt_printf(out, "btree topology error: ");
-
- set_bit(BCH_FS_topology_error, &c->flags);
- if (!test_bit(BCH_FS_in_recovery, &c->flags)) {
- __bch2_inconsistent_error(c, out);
- return bch_err_throw(c, btree_need_topology_repair);
- } else {
- return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
- bch_err_throw(c, btree_need_topology_repair);
- }
-}
-
-int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_log_msg_start(c, &buf);
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(&buf, fmt, args);
- va_end(args);
-
- int ret = __bch2_topology_error(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
-
- printbuf_exit(&buf);
- return ret;
-}
-
-void bch2_fatal_error(struct bch_fs *c)
-{
- if (bch2_fs_emergency_read_only(c))
- bch_err(c, "fatal error - emergency read only");
-}
-
-void bch2_io_error_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
- struct bch_fs *c = ca->fs;
-
- /* XXX: if it's reads or checksums that are failing, set it to failed */
-
- down_write(&c->state_lock);
- unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
-
- if (write_errors_start &&
- time_after(jiffies,
- write_errors_start + c->opts.write_error_timeout * HZ)) {
- if (ca->mi.state >= BCH_MEMBER_STATE_ro)
- goto out;
-
- bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED);
- struct printbuf buf = PRINTBUF;
- __bch2_log_msg_start(ca->name, &buf);
-
- prt_printf(&buf, "writes erroring for %u seconds, setting %s ro",
- c->opts.write_error_timeout,
- dev ? "device" : "filesystem");
- if (!dev)
- bch2_fs_emergency_read_only2(c, &buf);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-out:
- up_write(&c->state_lock);
-}
-
-void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
-{
- atomic64_inc(&ca->errors[type]);
-
- if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
- ca->write_errors_start = jiffies;
-
- queue_work(system_long_wq, &ca->io_error_work);
-}
-
-enum ask_yn {
- YN_NO,
- YN_YES,
- YN_ALLNO,
- YN_ALLYES,
-};
-
-static enum ask_yn parse_yn_response(char *buf)
-{
- buf = strim(buf);
-
- if (strlen(buf) == 1)
- switch (buf[0]) {
- case 'n':
- return YN_NO;
- case 'y':
- return YN_YES;
- case 'N':
- return YN_ALLNO;
- case 'Y':
- return YN_ALLYES;
- }
- return -1;
-}
-
-#ifdef __KERNEL__
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
-{
- struct stdio_redirect *stdio = c->stdio;
-
- if (c->stdio_filter && c->stdio_filter != current)
- stdio = NULL;
-
- if (!stdio)
- return YN_NO;
-
- if (trans)
- bch2_trans_unlock(trans);
-
- unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0;
- darray_char line = {};
- int ret;
-
- do {
- unsigned long t;
- bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
-rewait:
- t = unlock_long_at
- ? max_t(long, unlock_long_at - jiffies, 0)
- : MAX_SCHEDULE_TIMEOUT;
-
- int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t);
- if (r == -ETIME) {
- bch2_trans_unlock_long(trans);
- unlock_long_at = 0;
- goto rewait;
- }
-
- if (r < 0) {
- ret = YN_NO;
- break;
- }
-
- darray_last(line) = '\0';
- } while ((ret = parse_yn_response(line.data)) < 0);
-
- darray_exit(&line);
- return ret;
-}
-#else
-
-#include "tools-util.h"
-
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
-{
- char *buf = NULL;
- size_t buflen = 0;
- int ret;
-
- do {
- fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
- fflush(stdout);
-
- if (getline(&buf, &buflen, stdin) < 0)
- die("error reading from standard input");
- } while ((ret = parse_yn_response(buf)) < 0);
-
- free(buf);
- return ret;
-}
-
-#endif
-
-static struct fsck_err_state *fsck_err_get(struct bch_fs *c,
- enum bch_sb_error_id id)
-{
- struct fsck_err_state *s;
-
- list_for_each_entry(s, &c->fsck_error_msgs, list)
- if (s->id == id) {
- /*
- * move it to the head of the list: repeated fsck errors
- * are common
- */
- list_move(&s->list, &c->fsck_error_msgs);
- return s;
- }
-
- s = kzalloc(sizeof(*s), GFP_NOFS);
- if (!s) {
- if (!c->fsck_alloc_msgs_err)
- bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
- c->fsck_alloc_msgs_err = true;
- return NULL;
- }
-
- INIT_LIST_HEAD(&s->list);
- s->id = id;
- list_add(&s->list, &c->fsck_error_msgs);
- return s;
-}
-
-/* s/fix?/fixing/ s/recreate?/recreating/ */
-static void prt_actioning(struct printbuf *out, const char *action)
-{
- unsigned len = strlen(action);
-
- BUG_ON(action[len - 1] != '?');
- --len;
-
- if (action[len - 1] == 'e')
- --len;
-
- prt_bytes(out, action, len);
- prt_str(out, "ing");
-}
-
-static const u8 fsck_flags_extra[] = {
-#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags,
- BCH_SB_ERRS()
-#undef x
-};
-
-static int do_fsck_ask_yn(struct bch_fs *c,
- struct btree_trans *trans,
- struct printbuf *question,
- const char *action)
-{
- prt_str(question, ", ");
- prt_str(question, action);
-
- if (bch2_fs_stdio_redirect(c))
- bch2_print(c, "%s", question->buf);
- else
- bch2_print_str(c, KERN_ERR, question->buf);
-
- int ask = bch2_fsck_ask_yn(c, trans);
-
- if (trans) {
- int ret = bch2_trans_relock(trans);
- if (ret)
- return ret;
- }
-
- return ask;
-}
-
-static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c,
- enum bch_sb_error_id id, const char *msg,
- bool *repeat, bool *print, bool *suppress)
-{
- bch2_sb_error_count(c, id);
-
- struct fsck_err_state *s = fsck_err_get(c, id);
- if (s) {
- /*
- * We may be called multiple times for the same error on
- * transaction restart - this memoizes instead of asking the user
- * multiple times for the same error:
- */
- if (s->last_msg && !strcmp(msg, s->last_msg)) {
- *repeat = true;
- *print = false;
- return s;
- }
-
- kfree(s->last_msg);
- s->last_msg = kstrdup(msg, GFP_KERNEL);
-
- if (c->opts.ratelimit_errors &&
- s->nr >= FSCK_ERR_RATELIMIT_NR) {
- if (s->nr == FSCK_ERR_RATELIMIT_NR)
- *suppress = true;
- else
- *print = false;
- }
-
- s->nr++;
- }
- return s;
-}
-
-bool __bch2_count_fsck_err(struct bch_fs *c,
- enum bch_sb_error_id id, struct printbuf *msg)
-{
- bch2_sb_error_count(c, id);
-
- mutex_lock(&c->fsck_error_msgs_lock);
- bool print = true, repeat = false, suppress = false;
-
- count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress);
- mutex_unlock(&c->fsck_error_msgs_lock);
-
- if (suppress)
- prt_printf(msg, "Ratelimiting new instances of previous error\n");
-
- return print && !repeat;
-}
-
-int bch2_fsck_err_opt(struct bch_fs *c,
- enum bch_fsck_flags flags,
- enum bch_sb_error_id err)
-{
- if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
- flags |= fsck_flags_extra[err];
-
- if (test_bit(BCH_FS_in_fsck, &c->flags)) {
- if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE)))
- return bch_err_throw(c, fsck_repair_unimplemented);
-
- switch (c->opts.fix_errors) {
- case FSCK_FIX_exit:
- return bch_err_throw(c, fsck_errors_not_fixed);
- case FSCK_FIX_yes:
- if (flags & FSCK_CAN_FIX)
- return bch_err_throw(c, fsck_fix);
- fallthrough;
- case FSCK_FIX_no:
- if (flags & FSCK_CAN_IGNORE)
- return bch_err_throw(c, fsck_ignore);
- return bch_err_throw(c, fsck_errors_not_fixed);
- case FSCK_FIX_ask:
- if (flags & FSCK_AUTOFIX)
- return bch_err_throw(c, fsck_fix);
- return bch_err_throw(c, fsck_ask);
- default:
- BUG();
- }
- } else {
- if ((flags & FSCK_AUTOFIX) &&
- (c->opts.errors == BCH_ON_ERROR_continue ||
- c->opts.errors == BCH_ON_ERROR_fix_safe))
- return bch_err_throw(c, fsck_fix);
-
- if (c->opts.errors == BCH_ON_ERROR_continue &&
- (flags & FSCK_CAN_IGNORE))
- return bch_err_throw(c, fsck_ignore);
- return bch_err_throw(c, fsck_errors_not_fixed);
- }
-}
-
-int __bch2_fsck_err(struct bch_fs *c,
- struct btree_trans *trans,
- enum bch_fsck_flags flags,
- enum bch_sb_error_id err,
- const char *fmt, ...)
-{
- va_list args;
- struct printbuf buf = PRINTBUF, *out = &buf;
- int ret = 0;
- const char *action_orig = "fix?", *action = action_orig;
-
- might_sleep();
-
- if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
- flags |= fsck_flags_extra[err];
-
- if (!c)
- c = trans->c;
-
- /*
- * Ugly: if there's a transaction in the current task it has to be
- * passed in to unlock if we prompt for user input.
- *
- * But, plumbing a transaction and transaction restarts into
- * bkey_validate() is problematic.
- *
- * So:
- * - make all bkey errors AUTOFIX, they're simple anyways (we just
- * delete the key)
- * - and we don't need to warn if we're not prompting
- */
- WARN_ON((flags & FSCK_CAN_FIX) &&
- !(flags & FSCK_AUTOFIX) &&
- !trans &&
- bch2_current_has_btree_trans(c));
-
- if (test_bit(err, c->sb.errors_silent))
- return flags & FSCK_CAN_FIX
- ? bch_err_throw(c, fsck_fix)
- : bch_err_throw(c, fsck_ignore);
-
- printbuf_indent_add_nextline(out, 2);
-
-#ifdef BCACHEFS_LOG_PREFIX
- if (strncmp(fmt, "bcachefs", 8))
- prt_printf(out, bch2_log_msg(c, ""));
-#endif
-
- va_start(args, fmt);
- prt_vprintf(out, fmt, args);
- va_end(args);
-
- /* Custom fix/continue/recreate/etc.? */
- if (out->buf[out->pos - 1] == '?') {
- const char *p = strrchr(out->buf, ',');
- if (p) {
- out->pos = p - out->buf;
- action = kstrdup(p + 2, GFP_KERNEL);
- if (!action) {
- ret = -ENOMEM;
- goto err;
- }
- }
- }
-
- mutex_lock(&c->fsck_error_msgs_lock);
- bool repeat = false, print = true, suppress = false;
- bool inconsistent = false, exiting = false;
- struct fsck_err_state *s =
- count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress);
- if (repeat) {
- ret = s->ret;
- goto err_unlock;
- }
-
- if ((flags & FSCK_AUTOFIX) &&
- (c->opts.errors == BCH_ON_ERROR_continue ||
- c->opts.errors == BCH_ON_ERROR_fix_safe)) {
- prt_str(out, ", ");
- if (flags & FSCK_CAN_FIX) {
- prt_actioning(out, action);
- ret = bch_err_throw(c, fsck_fix);
- } else {
- prt_str(out, ", continuing");
- ret = bch_err_throw(c, fsck_ignore);
- }
-
- goto print;
- } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) {
- if (c->opts.errors != BCH_ON_ERROR_continue ||
- !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
- prt_str_indented(out, ", shutting down\n"
- "error not marked as autofix and not in fsck\n"
- "run fsck, and forward to devs so error can be marked for self-healing");
- inconsistent = true;
- print = true;
- ret = bch_err_throw(c, fsck_errors_not_fixed);
- } else if (flags & FSCK_CAN_FIX) {
- prt_str(out, ", ");
- prt_actioning(out, action);
- ret = bch_err_throw(c, fsck_fix);
- } else {
- prt_str(out, ", continuing");
- ret = bch_err_throw(c, fsck_ignore);
- }
- } else if (c->opts.fix_errors == FSCK_FIX_exit) {
- prt_str(out, ", exiting");
- ret = bch_err_throw(c, fsck_errors_not_fixed);
- } else if (flags & FSCK_CAN_FIX) {
- int fix = s && s->fix
- ? s->fix
- : c->opts.fix_errors;
-
- if (fix == FSCK_FIX_ask) {
- print = false;
-
- ret = do_fsck_ask_yn(c, trans, out, action);
- if (ret < 0)
- goto err_unlock;
-
- if (ret >= YN_ALLNO && s)
- s->fix = ret == YN_ALLNO
- ? FSCK_FIX_no
- : FSCK_FIX_yes;
-
- ret = ret & 1
- ? bch_err_throw(c, fsck_fix)
- : bch_err_throw(c, fsck_ignore);
- } else if (fix == FSCK_FIX_yes ||
- (c->opts.nochanges &&
- !(flags & FSCK_CAN_IGNORE))) {
- prt_str(out, ", ");
- prt_actioning(out, action);
- ret = bch_err_throw(c, fsck_fix);
- } else {
- prt_str(out, ", not ");
- prt_actioning(out, action);
- ret = bch_err_throw(c, fsck_ignore);
- }
- } else {
- if (flags & FSCK_CAN_IGNORE) {
- prt_str(out, ", continuing");
- ret = bch_err_throw(c, fsck_ignore);
- } else {
- prt_str(out, " (repair unimplemented)");
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- }
- }
-
- if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) &&
- (c->opts.fix_errors == FSCK_FIX_exit ||
- !(flags & FSCK_CAN_IGNORE)))
- ret = bch_err_throw(c, fsck_errors_not_fixed);
-
- if (test_bit(BCH_FS_in_fsck, &c->flags) &&
- (!bch2_err_matches(ret, BCH_ERR_fsck_fix) &&
- !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) {
- exiting = true;
- print = true;
- }
-print:
- prt_newline(out);
-
- if (inconsistent)
- __bch2_inconsistent_error(c, out);
- else if (exiting)
- prt_printf(out, "Unable to continue, halting\n");
- else if (suppress)
- prt_printf(out, "Ratelimiting new instances of previous error\n");
-
- if (print) {
- /* possibly strip an empty line, from printbuf_indent_add */
- while (out->pos && out->buf[out->pos - 1] == ' ')
- --out->pos;
- printbuf_nul_terminate(out);
-
- if (bch2_fs_stdio_redirect(c))
- bch2_print(c, "%s", out->buf);
- else
- bch2_print_str(c, KERN_ERR, out->buf);
- }
-
- if (s)
- s->ret = ret;
-
- if (trans &&
- !(flags & FSCK_ERR_NO_LOG) &&
- ret == -BCH_ERR_fsck_fix)
- ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret;
-err_unlock:
- mutex_unlock(&c->fsck_error_msgs_lock);
-err:
- /*
- * We don't yet track whether the filesystem currently has errors, for
- * log_fsck_err()s: that would require us to track for every error type
- * which recovery pass corrects it, to get the fsck exit status correct:
- */
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- /* nothing */
- } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) {
- set_bit(BCH_FS_errors_fixed, &c->flags);
- } else {
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- set_bit(BCH_FS_error, &c->flags);
- }
-
- if (action != action_orig)
- kfree(action);
- printbuf_exit(&buf);
-
- BUG_ON(!ret);
- return ret;
-}
-
-static const char * const bch2_bkey_validate_contexts[] = {
-#define x(n) #n,
- BKEY_VALIDATE_CONTEXTS()
-#undef x
- NULL
-};
-
-int __bch2_bkey_fsck_err(struct bch_fs *c,
- struct bkey_s_c k,
- struct bkey_validate_context from,
- enum bch_sb_error_id err,
- const char *fmt, ...)
-{
- if (from.flags & BCH_VALIDATE_silent)
- return bch_err_throw(c, fsck_delete_bkey);
-
- unsigned fsck_flags = 0;
- if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
- if (test_bit(err, c->sb.errors_silent))
- return bch_err_throw(c, fsck_delete_bkey);
-
- fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
- }
- if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
- fsck_flags |= fsck_flags_extra[err];
-
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "invalid bkey in %s",
- bch2_bkey_validate_contexts[from.from]);
-
- if (from.from == BKEY_VALIDATE_journal)
- prt_printf(&buf, " journal seq=%llu offset=%u",
- from.journal_seq, from.journal_offset);
-
- prt_str(&buf, " btree=");
- bch2_btree_id_to_text(&buf, from.btree);
- prt_printf(&buf, " level=%u: ", from.level);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(&buf, fmt, args);
- va_end(args);
-
- int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print)
-{
- struct fsck_err_state *s, *n;
-
- mutex_lock(&c->fsck_error_msgs_lock);
-
- list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
- if (print && s->ratelimited && s->last_msg)
- bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
-
- list_del(&s->list);
- kfree(s->last_msg);
- kfree(s);
- }
-
- mutex_unlock(&c->fsck_error_msgs_lock);
-}
-
-void bch2_flush_fsck_errs(struct bch_fs *c)
-{
- __bch2_flush_fsck_errs(c, true);
-}
-
-void bch2_free_fsck_errs(struct bch_fs *c)
-{
- __bch2_flush_fsck_errs(c, false);
-}
-
-int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- subvol_inum inum, u64 offset)
-{
- u32 restart_count = trans->restart_count;
- int ret = 0;
-
- if (inum.subvol) {
- ret = bch2_inum_to_path(trans, inum, out);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
- }
- if (!inum.subvol || ret)
- prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
- prt_printf(out, " offset %llu: ", offset);
-
- return trans_was_restarted(trans, restart_count);
-}
-
-void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
- subvol_inum inum, u64 offset)
-{
- bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
-}
-
-int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- struct bpos pos)
-{
- int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out);
- if (ret)
- return ret;
-
- prt_printf(out, " offset %llu: ", pos.offset << 8);
- return 0;
-}
-
-void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
- struct bpos pos)
-{
- bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
-}
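One detail worth calling out from error.c: count_fsck_err_locked() keeps per-error-id state (last message, occurrence count) so repeated reports are memoized, and with ratelimit_errors enabled it announces suppression once when the count reaches FSCK_ERR_RATELIMIT_NR and then goes quiet. A toy, lock-free sketch of that suppress-after-N bookkeeping follows; the names and the limit are made up.

/*
 * Toy sketch of the ratelimit bookkeeping in count_fsck_err_locked():
 * print the first N instances of an error id, announce suppression once,
 * then stay quiet. No locking or per-id lookup shown.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_RATELIMIT_NR 3

struct demo_err_state {
        unsigned long nr;               /* how many times this id was seen */
};

static void demo_count_err(struct demo_err_state *s, const char *msg,
                           bool *print, bool *suppress)
{
        *print    = true;
        *suppress = false;

        if (s->nr >= DEMO_RATELIMIT_NR) {
                if (s->nr == DEMO_RATELIMIT_NR)
                        *suppress = true;       /* announce suppression once */
                else
                        *print = false;         /* then stay silent */
        }
        s->nr++;

        if (*print)
                printf("fsck error: %s\n", msg);
        if (*suppress)
                printf("Ratelimiting new instances of previous error\n");
}

int main(void)
{
        struct demo_err_state s = {0};
        bool print, suppress;

        for (int i = 0; i < 6; i++)
                demo_count_err(&s, "toy inconsistency", &print, &suppress);
        return 0;
}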
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
deleted file mode 100644
index 0c3c3a24fc6f..000000000000
--- a/fs/bcachefs/error.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_ERROR_H
-#define _BCACHEFS_ERROR_H
-
-#include <linux/list.h>
-#include <linux/printk.h>
-#include "bkey_types.h"
-#include "sb-errors.h"
-
-struct bch_dev;
-struct bch_fs;
-struct work_struct;
-
-/*
- * XXX: separate out errors that indicate on disk data is inconsistent, and flag
- * superblock as such
- */
-
-/* Error messages: */
-
-void __bch2_log_msg_start(const char *, struct printbuf *);
-
-static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out)
-{
- __bch2_log_msg_start(c->name, out);
-}
-
-/*
- * Inconsistency errors: The on disk data is inconsistent. If these occur during
- * initial recovery, they don't indicate a bug in the running code - we walk all
- * the metadata before modifying anything. If they occur at runtime, they
- * indicate either a bug in the running code or (less likely) data is being
- * silently corrupted under us.
- *
- * XXX: audit all inconsistent errors and make sure they're all recoverable, in
- * BCH_ON_ERROR_CONTINUE mode
- */
-
-bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *);
-bool bch2_inconsistent_error(struct bch_fs *);
-__printf(2, 3)
-bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...);
-
-#define bch2_fs_inconsistent_on(cond, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- if (_ret) \
- bch2_fs_inconsistent(__VA_ARGS__); \
- _ret; \
-})
-
-__printf(2, 3)
-bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...);
-
-#define bch2_trans_inconsistent_on(cond, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- if (_ret) \
- bch2_trans_inconsistent(__VA_ARGS__); \
- _ret; \
-})
-
-int __bch2_topology_error(struct bch_fs *, struct printbuf *);
-__printf(2, 3)
-int bch2_fs_topology_error(struct bch_fs *, const char *, ...);
-
-/*
- * Fsck errors: inconsistency errors we detect at mount time, and should ideally
- * be able to repair:
- */
-
-struct fsck_err_state {
- struct list_head list;
- enum bch_sb_error_id id;
- u64 nr;
- bool ratelimited;
- int ret;
- int fix;
- char *last_msg;
-};
-
-#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
-
-bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *);
-#define bch2_count_fsck_err(_c, _err, ...) \
- __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__)
-
-int bch2_fsck_err_opt(struct bch_fs *,
- enum bch_fsck_flags,
- enum bch_sb_error_id);
-
-__printf(5, 6) __cold
-int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
- enum bch_fsck_flags,
- enum bch_sb_error_id,
- const char *, ...);
-#define bch2_fsck_err(c, _flags, _err_type, ...) \
- __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\
- type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\
- _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
-
-void bch2_flush_fsck_errs(struct bch_fs *);
-void bch2_free_fsck_errs(struct bch_fs *);
-
-#define fsck_err_wrap(_do) \
-({ \
- int _ret = _do; \
- if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
- !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \
- ret = _ret; \
- goto fsck_err; \
- } \
- \
- bch2_err_matches(_ret, BCH_ERR_fsck_fix); \
-})
-
-#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
-
-/* These macros return true if error should be fixed: */
-
-/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-
-#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
-({ \
- might_sleep(); \
- \
- if (type_is(c, struct bch_fs *)) \
- WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\
- \
- (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\
-})
-
-#define mustfix_fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-
-#define mustfix_fsck_err_on(cond, c, _err_type, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-
-#define fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define fsck_err_on(cond, c, _err_type, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define log_fsck_err(c, _err_type, ...) \
- __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-
-#define log_fsck_err_on(cond, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- if (_ret) \
- log_fsck_err(__VA_ARGS__); \
- _ret; \
-})
-
-enum bch_validate_flags;
-__printf(5, 6)
-int __bch2_bkey_fsck_err(struct bch_fs *,
- struct bkey_s_c,
- struct bkey_validate_context from,
- enum bch_sb_error_id,
- const char *, ...);
-
-/*
- * for now, bkey fsck errors are always handled by deleting the entire key -
- * this will change at some point
- */
-#define bkey_fsck_err(c, _err_type, _err_msg, ...) \
-do { \
- int _ret = __bch2_bkey_fsck_err(c, k, from, \
- BCH_FSCK_ERR_##_err_type, \
- _err_msg, ##__VA_ARGS__); \
- if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
- !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \
- ret = _ret; \
- ret = bch_err_throw(c, fsck_delete_bkey); \
- goto fsck_err; \
-} while (0)
-
-#define bkey_fsck_err_on(cond, ...) \
-do { \
- if (unlikely(cond)) \
- bkey_fsck_err(__VA_ARGS__); \
-} while (0)
-
-/*
- * Fatal errors: these don't indicate a bug, but we can't continue running in RW
- * mode - pretty much just due to metadata IO errors:
- */
-
-void bch2_fatal_error(struct bch_fs *);
-
-#define bch2_fs_fatal_error(c, _msg, ...) \
-do { \
- bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__); \
- bch2_fatal_error(c); \
-} while (0)
-
-#define bch2_fs_fatal_err_on(cond, c, ...) \
-({ \
- bool _ret = unlikely(!!(cond)); \
- \
- if (_ret) \
- bch2_fs_fatal_error(c, __VA_ARGS__); \
- _ret; \
-})
-
-/*
- * IO errors: either recoverable metadata IO (because we have replicas), or data
- * IO - we need to log it and print out a message, but we don't (necessarily)
- * want to shut down the fs:
- */
-
-void bch2_io_error_work(struct work_struct *);
-
-/* Does the error handling without logging a message */
-void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
-static inline void bch2_account_io_success_fail(struct bch_dev *ca,
- enum bch_member_error_type type,
- bool success)
-{
- if (likely(success)) {
- if (type == BCH_MEMBER_ERROR_write &&
- ca->write_errors_start)
- ca->write_errors_start = 0;
- } else {
- bch2_io_error(ca, type);
- }
-}
-
-static inline void bch2_account_io_completion(struct bch_dev *ca,
- enum bch_member_error_type type,
- u64 submit_time, bool success)
-{
- if (unlikely(!ca))
- return;
-
- if (type != BCH_MEMBER_ERROR_checksum)
- bch2_latency_acct(ca, submit_time, type);
-
- bch2_account_io_success_fail(ca, type, success);
-}
-
-int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
-
-void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
-
-int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
-void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
-
-#endif /* _BCACHEFS_ERROR_H */
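The fsck_err_on()-style macros above assume a calling convention: the enclosing function declares an int ret and an fsck_err: label, the macro evaluates to true when the caller should repair, and it jumps to the label when the error can neither be fixed nor ignored. The sketch below is a simplified standalone imitation of that shape, not the real macros; the report function and error codes are invented.

/*
 * Simplified imitation of the fsck_err_on() calling convention -- not the
 * real macros. The enclosing function provides `ret' and the `fsck_err:'
 * label; the macro returns true when the caller should repair.
 */
#include <stdbool.h>
#include <stdio.h>

enum { DEMO_FIX, DEMO_IGNORE, DEMO_FATAL };

static int demo_report(const char *msg)        /* stand-in for the prompt/report path */
{
        printf("fsck: %s\n", msg);
        return DEMO_FIX;                       /* pretend the answer was "fix it" */
}

#define demo_fsck_err_on(cond, msg)                             \
({                                                              \
        bool _fix = false;                                      \
        if (cond) {                                             \
                int _r = demo_report(msg);                      \
                if (_r == DEMO_FATAL) {                         \
                        ret = -1;                               \
                        goto fsck_err;                          \
                }                                               \
                _fix = _r == DEMO_FIX;                          \
        }                                                       \
        _fix;                                                   \
})

static int demo_check_counter(int counter)
{
        int ret = 0;

        if (demo_fsck_err_on(counter < 0, "counter went negative, fix?"))
                counter = 0;                    /* the repair */

        printf("counter is now %d\n", counter);
fsck_err:
        return ret;
}

int main(void)
{
        return demo_check_counter(-5);
}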
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
deleted file mode 100644
index e76e58a568bf..000000000000
--- a/fs/bcachefs/extent_update.c
+++ /dev/null
@@ -1,155 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "debug.h"
-#include "extents.h"
-#include "extent_update.h"
-
-/*
- * This counts the number of iterators to the alloc & ec btrees we'll need
- * inserting/removing this extent:
- */
-static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- unsigned ret = 0, lru = 0;
-
- bkey_extent_entry_for_each(ptrs, entry) {
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- /* Might also be updating LRU btree */
- if (entry->ptr.cached)
- lru++;
-
- fallthrough;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- ret++;
- }
- }
-
- /*
- * Updating keys in the alloc btree may also update keys in the
- * freespace or discard btrees:
- */
- return lru + ret * 2;
-}
-
-#define EXTENT_ITERS_MAX 64
-
-static int count_iters_for_insert(struct btree_trans *trans,
- struct bkey_s_c k,
- unsigned offset,
- struct bpos *end,
- unsigned *nr_iters)
-{
- int ret = 0, ret2 = 0;
-
- if (*nr_iters >= EXTENT_ITERS_MAX) {
- *end = bpos_min(*end, k.k->p);
- ret = 1;
- }
-
- switch (k.k->type) {
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
-
- if (*nr_iters >= EXTENT_ITERS_MAX) {
- *end = bpos_min(*end, k.k->p);
- ret = 1;
- }
-
- break;
- case KEY_TYPE_reflink_p: {
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx = REFLINK_P_IDX(p.v);
- unsigned sectors = bpos_min(*end, p.k->p).offset -
- bkey_start_offset(p.k);
- struct btree_iter iter;
- struct bkey_s_c r_k;
-
- for_each_btree_key_norestart(trans, iter,
- BTREE_ID_reflink, POS(0, idx + offset),
- BTREE_ITER_slots, r_k, ret2) {
- if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
- break;
-
- /* extent_update_to_keys(), for the reflink_v update */
- *nr_iters += 1;
-
- *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
-
- if (*nr_iters >= EXTENT_ITERS_MAX) {
- struct bpos pos = bkey_start_pos(k.k);
- pos.offset += min_t(u64, k.k->size,
- r_k.k->p.offset - idx);
-
- *end = bpos_min(*end, pos);
- ret = 1;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- break;
- }
- }
-
- return ret2 ?: ret;
-}
-
-int bch2_extent_atomic_end(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos *end)
-{
- unsigned nr_iters = 0;
-
- struct btree_iter copy;
- bch2_trans_copy_iter(trans, &copy, iter);
-
- int ret = bch2_btree_iter_traverse(trans, &copy);
- if (ret)
- goto err;
-
- struct bkey_s_c k;
- for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) {
- unsigned offset = 0;
-
- if (bkey_gt(iter->pos, bkey_start_pos(k.k)))
- offset = iter->pos.offset - bkey_start_offset(k.k);
-
- ret = count_iters_for_insert(trans, k, offset, end, &nr_iters);
- if (ret)
- break;
- }
-err:
- bch2_trans_iter_exit(trans, &copy);
- return ret < 0 ? ret : 0;
-}
-
-int bch2_extent_trim_atomic(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *k)
-{
- struct bpos end = k->k.p;
- int ret = bch2_extent_atomic_end(trans, iter, &end);
- if (ret)
- return ret;
-
- /* tracepoint */
-
- if (bpos_lt(end, k->k.p)) {
- if (trace_extent_trim_atomic_enabled()) {
- CLASS(printbuf, buf)();
- bch2_bpos_to_text(&buf, end);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
- trace_extent_trim_atomic(trans->c, buf.buf);
- }
- bch2_cut_back(end, k);
- }
- return 0;
-}
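bch2_extent_atomic_end() implements a budgeting idea: walk the keys an insert would overlap, add up the btree iterators each of them would need, and clamp the end of the operation at the point where the budget (EXTENT_ITERS_MAX) would be exceeded, leaving the remainder for a later transaction. The toy sketch below mirrors only that clamping logic; the key list, per-key costs and budget are invented.

/*
 * Toy version of the iterator-budget clamp: stop extending an operation
 * once the accumulated cost of the keys it touches would exceed the budget.
 */
#include <stdio.h>

#define DEMO_ITERS_MAX 8

struct demo_key {
        unsigned long long end;         /* end offset covered by this key */
        unsigned           cost;        /* iterators needed to update it */
};

static unsigned long long demo_atomic_end(const struct demo_key *keys,
                                          unsigned nr,
                                          unsigned long long want_end)
{
        unsigned budget = 0;
        unsigned long long end = want_end;

        for (unsigned i = 0; i < nr && keys[i].end < want_end; i++) {
                budget += keys[i].cost;
                if (budget >= DEMO_ITERS_MAX) {
                        end = keys[i].end;      /* trim here, finish later */
                        break;
                }
        }
        return end;
}

int main(void)
{
        const struct demo_key keys[] = {
                { 16, 3 }, { 32, 3 }, { 48, 3 }, { 64, 3 },
        };

        /* Wanted to update up to 64; the budget runs out at the key ending at 48: */
        printf("trimmed end: %llu\n", demo_atomic_end(keys, 4, 64));
        return 0;
}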
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
deleted file mode 100644
index 34467db53f45..000000000000
--- a/fs/bcachefs/extent_update.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENT_UPDATE_H
-#define _BCACHEFS_EXTENT_UPDATE_H
-
-#include "bcachefs.h"
-
-int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
- struct bpos *);
-int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
- struct bkey_i *);
-
-#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
deleted file mode 100644
index 83cbd77dcb9c..000000000000
--- a/fs/bcachefs/extents.c
+++ /dev/null
@@ -1,1735 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- *
- * Code for managing the extent btree and dynamically updating the writeback
- * dirty sector count.
- */
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "compress.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "journal.h"
-#include "rebalance.h"
-#include "replicas.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-#include "util.h"
-
-static const char * const bch2_extent_flags_strs[] = {
-#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
- BCH_EXTENT_FLAGS()
-#undef x
- NULL,
-};
-
-static unsigned bch2_crc_field_size_max[] = {
- [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
- [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
- [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
-};
-
-static void bch2_extent_crc_pack(union bch_extent_crc *,
- struct bch_extent_crc_unpacked,
- enum bch_extent_entry_type);
-
-void bch2_io_failures_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_io_failures *failed)
-{
- static const char * const error_types[] = {
- "btree validate", "io", "checksum", "ec reconstruct", NULL
- };
-
- for (struct bch_dev_io_failures *f = failed->devs;
- f < failed->devs + failed->nr;
- f++) {
- unsigned errflags =
- ((!!f->failed_btree_validate) << 0) |
- ((!!f->failed_io) << 1) |
- ((!!f->failed_csum_nr) << 2) |
- ((!!f->failed_ec) << 3);
-
- bch2_printbuf_make_room(out, 1024);
- out->atomic++;
- scoped_guard(rcu) {
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
- if (ca)
- prt_str(out, ca->name);
- else
- prt_printf(out, "(invalid device %u)", f->dev);
- }
- --out->atomic;
-
- prt_char(out, ' ');
-
- if (!errflags) {
- prt_str(out, "no error - confused");
- } else if (is_power_of_2(errflags)) {
- prt_bitflags(out, error_types, errflags);
- prt_str(out, " error");
- } else {
- prt_str(out, "errors: ");
- prt_bitflags(out, error_types, errflags);
- }
- prt_newline(out);
- }
-}
-
-struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
- unsigned dev)
-{
- struct bch_dev_io_failures *i;
-
- for (i = f->devs; i < f->devs + f->nr; i++)
- if (i->dev == dev)
- return i;
-
- return NULL;
-}
-
-void bch2_mark_io_failure(struct bch_io_failures *failed,
- struct extent_ptr_decoded *p,
- bool csum_error)
-{
- struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
-
- if (!f) {
- BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
- f = &failed->devs[failed->nr++];
- memset(f, 0, sizeof(*f));
- f->dev = p->ptr.dev;
- }
-
- if (p->do_ec_reconstruct)
- f->failed_ec = true;
- else if (!csum_error)
- f->failed_io = true;
- else
- f->failed_csum_nr++;
-}
-
-void bch2_mark_btree_validate_failure(struct bch_io_failures *failed,
- unsigned dev)
-{
- struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev);
-
- if (!f) {
- BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
-
- f = &failed->devs[failed->nr++];
- memset(f, 0, sizeof(*f));
- f->dev = dev;
- }
-
- f->failed_btree_validate = true;
-}
-
-static inline u64 dev_latency(struct bch_dev *ca)
-{
- return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
-}
-
-static inline int dev_failed(struct bch_dev *ca)
-{
- return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
-}
-
-/*
- * returns true if p1 is better than p2:
- */
-static inline bool ptr_better(struct bch_fs *c,
- const struct extent_ptr_decoded p1,
- u64 p1_latency,
- struct bch_dev *ca1,
- const struct extent_ptr_decoded p2,
- u64 p2_latency)
-{
- struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
-
- int failed_delta = dev_failed(ca1) - dev_failed(ca2);
- if (unlikely(failed_delta))
- return failed_delta < 0;
-
- if (static_branch_unlikely(&bch2_force_reconstruct_read))
- return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
-
- if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
- return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
-
- int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
- if (unlikely(crc_retry_delta))
- return crc_retry_delta < 0;
-
- /* Pick at random, biased in favor of the faster device: */
-
- return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
-}
-
-/*
- * This picks a non-stale pointer to read from. If @dev is nonnegative, only
- * pointers on that device are considered; otherwise devices that recently
- * returned errors are avoided where possible.
- */
-int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_failures *failed,
- struct extent_ptr_decoded *pick,
- int dev)
-{
- bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
- bool have_dirty_ptrs = false, have_pick = false;
-
- if (k.k->type == KEY_TYPE_error)
- return bch_err_throw(c, key_type_error);
-
- rcu_read_lock();
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- u64 pick_latency;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- have_dirty_ptrs |= !p.ptr.cached;
-
- /*
- * Unwritten extent: no need to actually read, treat it as a
- * hole and return 0s:
- */
- if (p.ptr.unwritten) {
- rcu_read_unlock();
- return 0;
- }
-
- /* Are we being asked to read from a specific device? */
- if (dev >= 0 && p.ptr.dev != dev)
- continue;
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
-
- if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) {
- rcu_read_unlock();
- int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev);
- if (ret)
- return ret;
- rcu_read_lock();
- }
-
- if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
- continue;
-
- struct bch_dev_io_failures *f =
- unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
- if (unlikely(f)) {
- p.crc_retry_nr = f->failed_csum_nr;
- p.has_ec &= ~f->failed_ec;
-
- if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
- have_io_errors |= f->failed_io;
- have_io_errors |= f->failed_btree_validate;
- have_io_errors |= f->failed_ec;
- }
- have_csum_errors |= !!f->failed_csum_nr;
-
- if (p.has_ec && (f->failed_io || f->failed_csum_nr))
- p.do_ec_reconstruct = true;
- else if (f->failed_io ||
- f->failed_btree_validate ||
- f->failed_csum_nr > c->opts.checksum_err_retry_nr)
- continue;
- }
-
- have_missing_devs |= ca && !bch2_dev_is_online(ca);
-
- if (!ca || !bch2_dev_is_online(ca)) {
- if (!p.has_ec)
- continue;
- p.do_ec_reconstruct = true;
- }
-
- if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec)
- p.do_ec_reconstruct = true;
-
- u64 p_latency = dev_latency(ca);
- /*
- * Square the latencies, to bias more in favor of the faster
- * device - we never want to stop issuing reads to the slower
- * device altogether, so that we can update our latency numbers:
- */
- p_latency *= p_latency;
-
- if (!have_pick ||
- ptr_better(c,
- p, p_latency, ca,
- *pick, pick_latency)) {
- *pick = p;
- pick_latency = p_latency;
- have_pick = true;
- }
- }
- rcu_read_unlock();
-
- if (have_pick)
- return 1;
- if (!have_dirty_ptrs)
- return 0;
- if (have_missing_devs)
- return bch_err_throw(c, no_device_to_read_from);
- if (have_csum_errors)
- return bch_err_throw(c, data_read_csum_err);
- if (have_io_errors)
- return bch_err_throw(c, data_read_io_err);
-
- /*
- * If we get here, we have pointers (bkey_ptrs_validate() ensures that),
- * but they don't point to valid devices:
- */
- return bch_err_throw(c, no_devices_valid);
-}
-
-/* KEY_TYPE_btree_ptr: */
-
-int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
- c, btree_ptr_val_too_big,
- "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
- c, btree_ptr_v2_val_too_big,
- "value too big (%zu > %zu)",
- bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
-
- bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
- c, btree_ptr_v2_min_key_bad,
- "min_key > key");
-
- if ((from.flags & BCH_VALIDATE_write) &&
- c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
- bkey_fsck_err_on(!bp.v->sectors_written,
- c, btree_ptr_v2_written_0,
- "sectors_written == 0");
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
- prt_printf(out, "seq %llx written %u min_key %s",
- le64_to_cpu(bp.v->seq),
- le16_to_cpu(bp.v->sectors_written),
- BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
-
- bch2_bpos_to_text(out, bp.v->min_key);
- prt_printf(out, " ");
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
- unsigned big_endian, int write,
- struct bkey_s k)
-{
- struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
-
- compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
-
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id_is_extents(btree_id) &&
- !bkey_eq(bp.v->min_key, POS_MIN))
- bp.v->min_key = write
- ? bpos_nosnap_predecessor(bp.v->min_key)
- : bpos_nosnap_successor(bp.v->min_key);
-}
-
-/* KEY_TYPE_extent: */
-
-bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
-{
- struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
- struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
- union bch_extent_entry *en_l;
- const union bch_extent_entry *en_r;
- struct extent_ptr_decoded lp, rp;
- bool use_right_ptr;
-
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
- if (extent_entry_type(en_l) != extent_entry_type(en_r))
- return false;
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- if (en_l < l_ptrs.end || en_r < r_ptrs.end)
- return false;
-
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- lp.crc = bch2_extent_crc_unpack(l.k, NULL);
- rp.crc = bch2_extent_crc_unpack(r.k, NULL);
-
- guard(rcu)();
-
- while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
- __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
- if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
- rp.ptr.offset + rp.crc.offset ||
- lp.ptr.dev != rp.ptr.dev ||
- lp.ptr.gen != rp.ptr.gen ||
- lp.ptr.unwritten != rp.ptr.unwritten ||
- lp.has_ec != rp.has_ec)
- return false;
-
- /* Extents may not straddle buckets: */
- struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
- bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
-
- if (!same_bucket)
- return false;
-
- if (lp.has_ec != rp.has_ec ||
- (lp.has_ec &&
- (lp.ec.block != rp.ec.block ||
- lp.ec.redundancy != rp.ec.redundancy ||
- lp.ec.idx != rp.ec.idx)))
- return false;
-
- if (lp.crc.compression_type != rp.crc.compression_type ||
- lp.crc.nonce != rp.crc.nonce)
- return false;
-
- if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
- lp.crc.uncompressed_size) {
- /* can use left extent's crc entry */
- } else if (lp.crc.live_size <= rp.crc.offset) {
- /* can use right extent's crc entry */
- } else {
- /* check if checksums can be merged: */
- if (lp.crc.csum_type != rp.crc.csum_type ||
- lp.crc.nonce != rp.crc.nonce ||
- crc_is_compressed(lp.crc) ||
- !bch2_checksum_mergeable(lp.crc.csum_type))
- return false;
-
- if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
- rp.crc.offset)
- return false;
-
- if (lp.crc.csum_type &&
- lp.crc.uncompressed_size +
- rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
- return false;
- }
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
- if (extent_entry_is_crc(en_l)) {
- struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
- struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
- if (crc_l.uncompressed_size + crc_r.uncompressed_size >
- bch2_crc_field_size_max[extent_entry_type(en_l)])
- return false;
- }
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- use_right_ptr = false;
- en_l = l_ptrs.start;
- en_r = r_ptrs.start;
- while (en_l < l_ptrs.end) {
- if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
- use_right_ptr)
- en_l->ptr = en_r->ptr;
-
- if (extent_entry_is_crc(en_l)) {
- struct bch_extent_crc_unpacked crc_l =
- bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
- struct bch_extent_crc_unpacked crc_r =
- bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
- use_right_ptr = false;
-
- if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
- crc_l.uncompressed_size) {
- /* can use left extent's crc entry */
- } else if (crc_l.live_size <= crc_r.offset) {
- /* can use right extent's crc entry */
- crc_r.offset -= crc_l.live_size;
- bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
- extent_entry_type(en_l));
- use_right_ptr = true;
- } else {
- crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
- crc_l.csum,
- crc_r.csum,
- crc_r.uncompressed_size << 9);
-
- crc_l.uncompressed_size += crc_r.uncompressed_size;
- crc_l.compressed_size += crc_r.compressed_size;
- bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
- extent_entry_type(en_l));
- }
- }
-
- en_l = extent_entry_next(en_l);
- en_r = extent_entry_next(en_r);
- }
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-/* KEY_TYPE_reservation: */
-
-int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- int ret = 0;
-
- bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
- c, reservation_key_nr_replicas_invalid,
- "invalid nr_replicas (%u)", r.v->nr_replicas);
-fsck_err:
- return ret;
-}
-
-void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
- prt_printf(out, "generation %u replicas %u",
- le32_to_cpu(r.v->generation),
- r.v->nr_replicas);
-}
-
-bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reservation l = bkey_s_to_reservation(_l);
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
-
- if (l.v->generation != r.v->generation ||
- l.v->nr_replicas != r.v->nr_replicas)
- return false;
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-/* Extent checksum entries: */
-
-/* returns true if not equal */
-static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
- struct bch_extent_crc_unpacked r)
-{
- return (l.csum_type != r.csum_type ||
- l.compression_type != r.compression_type ||
- l.compressed_size != r.compressed_size ||
- l.uncompressed_size != r.uncompressed_size ||
- l.offset != r.offset ||
- l.live_size != r.live_size ||
- l.nonce != r.nonce ||
- bch2_crc_cmp(l.csum, r.csum));
-}
-
-static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
- struct bch_extent_crc_unpacked n)
-{
- return !crc_is_compressed(u) &&
- u.csum_type &&
- u.uncompressed_size > u.live_size &&
- bch2_csum_type_is_encryption(u.csum_type) ==
- bch2_csum_type_is_encryption(n.csum_type);
-}
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
- struct bch_extent_crc_unpacked n)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
-
- if (!n.csum_type)
- return false;
-
- bkey_for_each_crc(k.k, ptrs, crc, i)
- if (can_narrow_crc(crc, n))
- return true;
-
- return false;
-}
-
-/*
- * We're writing another replica for this extent, so while we've got the data in
- * memory we'll be computing a new checksum for the currently live data.
- *
- * If there are other replicas we aren't moving, and they are checksummed but
- * not compressed, we can modify them to point to only the data that is
- * currently live (so that readers won't have to bounce) while we've got the
- * checksum we need:
- */
-bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- struct bch_extent_crc_unpacked u;
- struct extent_ptr_decoded p;
- union bch_extent_entry *i;
- bool ret = false;
-
- /* Find a checksum entry that covers only live data: */
- if (!n.csum_type) {
- bkey_for_each_crc(&k->k, ptrs, u, i)
- if (!crc_is_compressed(u) &&
- u.csum_type &&
- u.live_size == u.uncompressed_size) {
- n = u;
- goto found;
- }
- return false;
- }
-found:
- BUG_ON(crc_is_compressed(n));
- BUG_ON(n.offset);
- BUG_ON(n.live_size != k->k.size);
-
-restart_narrow_pointers:
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-
- bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
- if (can_narrow_crc(p.crc, n)) {
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
- p.ptr.offset += p.crc.offset;
- p.crc = n;
- bch2_extent_ptr_decoded_append(k, &p);
- ret = true;
- goto restart_narrow_pointers;
- }
-
- return ret;
-}
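As a concrete illustration of what narrowing buys (numbers invented): if a checksummed, uncompressed extent was written as 128 sectors but only 32 of them are still live starting at crc.offset 16, a fresh checksum over just those 32 sectors lets the loop above re-append each pointer with ptr.offset advanced by 16 and a crc covering only the live region, so readers no longer have to fetch and checksum the full 128 sectors. A minimal usage sketch, assuming insert is the struct bkey_i being written and new_crc was computed over the currently live data:

	if (bch2_can_narrow_extent_crcs(bkey_i_to_s_c(insert), new_crc))
		bch2_bkey_narrow_crcs(insert, new_crc);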
-
-static void bch2_extent_crc_pack(union bch_extent_crc *dst,
- struct bch_extent_crc_unpacked src,
- enum bch_extent_entry_type type)
-{
-#define common_fields(_src) \
- .type = BIT(type), \
- .csum_type = _src.csum_type, \
- .compression_type = _src.compression_type, \
- ._compressed_size = _src.compressed_size - 1, \
- ._uncompressed_size = _src.uncompressed_size - 1, \
- .offset = _src.offset
-
- switch (type) {
- case BCH_EXTENT_ENTRY_crc32:
- dst->crc32 = (struct bch_extent_crc32) {
- common_fields(src),
- .csum = (u32 __force) *((__le32 *) &src.csum.lo),
- };
- break;
- case BCH_EXTENT_ENTRY_crc64:
- dst->crc64 = (struct bch_extent_crc64) {
- common_fields(src),
- .nonce = src.nonce,
- .csum_lo = (u64 __force) src.csum.lo,
- .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi),
- };
- break;
- case BCH_EXTENT_ENTRY_crc128:
- dst->crc128 = (struct bch_extent_crc128) {
- common_fields(src),
- .nonce = src.nonce,
- .csum = src.csum,
- };
- break;
- default:
- BUG();
- }
-#undef common_fields
-}
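Note the "- 1" applied to the size fields above: the packed _compressed_size/_uncompressed_size fields store size minus one, mirroring the "+ 1" in bch2_extent_crc_unpack() in extents.h, so the full field range is usable and a zero-sector region is unrepresentable. A standalone round-trip check of that encoding (illustrative plain C, not bcachefs code):

#include <assert.h>

/* mirrors the size encoding of the bch_extent_crc* fields: store size - 1 */
static unsigned pack_size(unsigned size)    { return size - 1; }
static unsigned unpack_size(unsigned field) { return field + 1; }

int main(void)
{
	for (unsigned size = 1; size <= 128; size++)
		assert(unpack_size(pack_size(size)) == size);
	return 0;
}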
-
-void bch2_extent_crc_append(struct bkey_i *k,
- struct bch_extent_crc_unpacked new)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- union bch_extent_crc *crc = (void *) ptrs.end;
- enum bch_extent_entry_type type;
-
- if (bch_crc_bytes[new.csum_type] <= 4 &&
- new.uncompressed_size <= CRC32_SIZE_MAX &&
- new.nonce <= CRC32_NONCE_MAX)
- type = BCH_EXTENT_ENTRY_crc32;
- else if (bch_crc_bytes[new.csum_type] <= 10 &&
- new.uncompressed_size <= CRC64_SIZE_MAX &&
- new.nonce <= CRC64_NONCE_MAX)
- type = BCH_EXTENT_ENTRY_crc64;
- else if (bch_crc_bytes[new.csum_type] <= 16 &&
- new.uncompressed_size <= CRC128_SIZE_MAX &&
- new.nonce <= CRC128_NONCE_MAX)
- type = BCH_EXTENT_ENTRY_crc128;
- else
- BUG();
-
- bch2_extent_crc_pack(crc, new, type);
-
- k->k.u64s += extent_entry_u64s(ptrs.end);
-
- EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
-}
-
-/* Generic code for keys with pointers: */
-
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
-{
- return bch2_bkey_devs(k).nr;
-}
-
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
-{
- return k.k->type == KEY_TYPE_reservation
- ? bkey_s_c_to_reservation(k).v->nr_replicas
- : bch2_bkey_dirty_devs(k).nr;
-}
-
-unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
-{
- unsigned ret = 0;
-
- if (k.k->type == KEY_TYPE_reservation) {
- ret = bkey_s_c_to_reservation(k).v->nr_replicas;
- } else {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- ret += !p.ptr.cached && !crc_is_compressed(p.crc);
- }
-
- return ret;
-}
-
-unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned ret = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached && crc_is_compressed(p.crc))
- ret += p.crc.compressed_size;
-
- return ret;
-}
-
-bool bch2_bkey_is_incompressible(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
-
- bkey_for_each_crc(k.k, ptrs, crc, entry)
- if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
- return true;
- return false;
-}
-
-unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p = { 0 };
- unsigned replicas = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.cached)
- continue;
-
- if (p.has_ec)
- replicas += p.ec.redundancy;
-
- replicas++;
-
- }
-
- return replicas;
-}
-
-static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
-{
- if (p->ptr.cached)
- return 0;
-
- return p->has_ec
- ? p->ec.redundancy + 1
- : ca->mi.durability;
-}
-
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
-{
- struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
-
- return ca ? __extent_ptr_durability(ca, p) : 0;
-}
-
-unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
-{
- struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
-
- if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
- return 0;
-
- return __extent_ptr_durability(ca, p);
-}
-
-unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned durability = 0;
-
- guard(rcu)();
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- durability += bch2_extent_ptr_durability(c, &p);
- return durability;
-}
-
-static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned durability = 0;
-
- guard(rcu)();
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
- durability += bch2_extent_ptr_durability(c, &p);
- return durability;
-}
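A worked example of the durability sums above (values invented): an extent with two non-cached replicas on durability-1 devices plus one erasure-coded pointer whose stripe has redundancy 1 gets durability 1 + 1 + (1 + 1) = 4, and cached pointers contribute nothing. The same arithmetic in standalone form:

#include <assert.h>
#include <stdbool.h>

struct ptr_sketch {
	bool		cached;
	bool		has_ec;
	unsigned	ec_redundancy;
	unsigned	dev_durability;
};

/* mirrors __extent_ptr_durability(): cached = 0, ec = redundancy + 1, else device durability */
static unsigned ptr_durability(const struct ptr_sketch *p)
{
	if (p->cached)
		return 0;
	return p->has_ec ? p->ec_redundancy + 1 : p->dev_durability;
}

int main(void)
{
	struct ptr_sketch ptrs[] = {
		{ .dev_durability = 1 },			/* plain replica */
		{ .dev_durability = 1 },			/* plain replica */
		{ .has_ec = true, .ec_redundancy = 1 },		/* erasure coded replica */
		{ .cached = true, .dev_durability = 1 },	/* cached: ignored */
	};
	unsigned total = 0;

	for (unsigned i = 0; i < sizeof(ptrs) / sizeof(ptrs[0]); i++)
		total += ptr_durability(&ptrs[i]);

	assert(total == 4);
	return 0;
}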
-
-void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
-{
- union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
- union bch_extent_entry *next = extent_entry_next(entry);
-
- memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
- k->k.u64s -= extent_entry_u64s(entry);
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *k,
- struct extent_ptr_decoded *p)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- struct bch_extent_crc_unpacked crc =
- bch2_extent_crc_unpack(&k->k, NULL);
- union bch_extent_entry *pos;
-
- if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
- pos = ptrs.start;
- goto found;
- }
-
- bkey_for_each_crc(&k->k, ptrs, crc, pos)
- if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
- pos = extent_entry_next(pos);
- goto found;
- }
-
- bch2_extent_crc_append(k, p->crc);
- pos = bkey_val_end(bkey_i_to_s(k));
-found:
- p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- __extent_entry_insert(k, pos, to_entry(&p->ptr));
-
- if (p->has_ec) {
- p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
- __extent_entry_insert(k, pos, to_entry(&p->ec));
- }
-}
-
-static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
- union bch_extent_entry *entry)
-{
- union bch_extent_entry *i = ptrs.start;
-
- if (i == entry)
- return NULL;
-
- while (extent_entry_next(i) != entry)
- i = extent_entry_next(i);
- return i;
-}
-
-/*
- * Drop @ptr from @k, along with its stripe_ptr entry and the preceding crc
- * entry if no following pointer still references that crc:
- */
-void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry = to_entry(ptr), *next;
- bool drop_crc = true;
-
- if (k.k->type == KEY_TYPE_stripe) {
- ptr->dev = BCH_SB_MEMBER_INVALID;
- return;
- }
-
- EBUG_ON(ptr < &ptrs.start->ptr ||
- ptr >= &ptrs.end->ptr);
- EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
-
- for (next = extent_entry_next(entry);
- next != ptrs.end;
- next = extent_entry_next(next)) {
- if (extent_entry_is_crc(next)) {
- break;
- } else if (extent_entry_is_ptr(next)) {
- drop_crc = false;
- break;
- }
- }
-
- extent_entry_drop(k, entry);
-
- while ((entry = extent_entry_prev(ptrs, entry))) {
- if (extent_entry_is_ptr(entry))
- break;
-
- if ((extent_entry_is_crc(entry) && drop_crc) ||
- extent_entry_is_stripe_ptr(entry))
- extent_entry_drop(k, entry);
- }
-}
-
-void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
-{
- if (k.k->type != KEY_TYPE_stripe) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == ptr->dev && p.has_ec) {
- ptr->dev = BCH_SB_MEMBER_INVALID;
- return;
- }
- }
-
- bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
-
- bch2_bkey_drop_ptr_noerror(k, ptr);
-
- /*
-	 * If we deleted all the dirty pointers and there are still cached
- * pointers, we could set the cached pointers to dirty if they're not
- * stale - but to do that correctly we'd need to grab an open_bucket
- * reference so that we don't race with bucket reuse:
- */
- if (have_dirty &&
- !bch2_bkey_dirty_devs(k.s_c).nr) {
- k.k->type = KEY_TYPE_error;
- set_bkey_val_u64s(k.k, 0);
- } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
- k.k->type = KEY_TYPE_deleted;
- set_bkey_val_u64s(k.k, 0);
- }
-}
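The key-type switch at the end of bch2_bkey_drop_ptr() is easy to tabulate (scenarios invented for illustration): losing the last dirty pointer while cached pointers remain turns the key into KEY_TYPE_error, so readers see an error rather than a silent hole; losing the last pointer of any kind turns it into KEY_TYPE_deleted. A standalone restatement of that decision:

#include <assert.h>
#include <stdbool.h>

enum outcome_sketch { DATA, ERROR_KEY, DELETED };

/* mirrors the tail of bch2_bkey_drop_ptr(): what the key becomes after the drop */
static enum outcome_sketch after_drop(bool had_dirty, unsigned dirty_left, unsigned ptrs_left)
{
	if (had_dirty && !dirty_left)
		return ERROR_KEY;	/* all dirty pointers gone: data is lost */
	if (!ptrs_left)
		return DELETED;		/* nothing left at all */
	return DATA;
}

int main(void)
{
	assert(after_drop(true, 0, 1) == ERROR_KEY);	/* dirty ptr dropped, cached ptr remains */
	assert(after_drop(true, 1, 1) == DATA);		/* another dirty ptr remains */
	assert(after_drop(false, 0, 0) == DELETED);	/* dropped the last, cached, pointer */
	return 0;
}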
-
-void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
-{
- bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
-}
-
-void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
-{
- bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
-}
-
-const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == dev)
- return ptr;
-
- return NULL;
-}
-
-bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_dev *ca;
-
- guard(rcu)();
- bkey_for_each_ptr(ptrs, ptr)
- if (bch2_dev_in_target(c, ptr->dev, target) &&
- (ca = bch2_dev_rcu(c, ptr->dev)) &&
- (!ptr->cached ||
- !dev_ptr_stale_rcu(ca, ptr)))
- return true;
-
- return false;
-}
-
-bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
- struct bch_extent_ptr m, u64 offset)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == m.dev &&
- p.ptr.gen == m.gen &&
- (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
- (s64) m.offset - offset)
- return true;
-
- return false;
-}
-
-/*
- * Returns true if two extents refer to the same data:
- */
-bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
-{
- if (k1.k->type != k2.k->type)
- return false;
-
- if (bkey_extent_is_direct_data(k1.k)) {
- struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
- struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
- const union bch_extent_entry *entry1, *entry2;
- struct extent_ptr_decoded p1, p2;
-
- if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
- return false;
-
- bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
- bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
- if (p1.ptr.dev == p2.ptr.dev &&
- p1.ptr.gen == p2.ptr.gen &&
-
- /*
- * This checks that the two pointers point
- * to the same region on disk - adjusting
- * for the difference in where the extents
- * start, since one may have been trimmed:
- */
- (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
-
- /*
- * This additionally checks that the
- * extents overlap on disk, since the
- * previous check may trigger spuriously
- * when one extent is immediately partially
- * overwritten with another extent (so that
- * on disk they are adjacent) and
- * compression is in use:
- */
- ((p1.ptr.offset >= p2.ptr.offset &&
- p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
- (p2.ptr.offset >= p1.ptr.offset &&
- p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
- return true;
-
- return false;
- } else {
- /* KEY_TYPE_deleted, etc. */
- return true;
- }
-}
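A worked example of the offset normalization in the first check above (numbers invented): k1 covers [100, 228) with a pointer at device sector 1000 and crc.offset 0; k2 is the same data trimmed at the front to [104, 228), so its crc.offset is 4 while the pointer is unchanged. Both sides normalize to 1000 + 0 - 100 == 1000 + 4 - 104 == 900, and the on-disk ranges overlap, so the extents match. In standalone form:

#include <assert.h>

int main(void)
{
	/* k1: pointer at sector 1000, crc.offset 0, bkey_start_offset 100 */
	long long p1_offset = 1000, p1_crc_offset = 0, k1_start = 100;
	/* k2: same pointer, trimmed by 4 sectors at the front */
	long long p2_offset = 1000, p2_crc_offset = 4, k2_start = 104;

	/* same normalization as bch2_extents_match()/bch2_bkey_matches_ptr() */
	assert(p1_offset + p1_crc_offset - k1_start ==
	       p2_offset + p2_crc_offset - k2_start);
	return 0;
}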
-
-struct bch_extent_ptr *
-bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
-{
- struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
- union bch_extent_entry *entry2;
- struct extent_ptr_decoded p2;
-
- bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
- if (p1.ptr.dev == p2.ptr.dev &&
- p1.ptr.gen == p2.ptr.gen &&
- (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
- return &entry2->ptr;
-
- return NULL;
-}
-
-static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
- struct bch_extent_ptr *ptr)
-{
- unsigned target = opts->promote_target ?: opts->foreground_target;
-
- if (target && !bch2_dev_in_target(c, ptr->dev, target))
- return false;
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
-
- return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
-}
-
-void bch2_extent_ptr_set_cached(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s k,
- struct bch_extent_ptr *ptr)
-{
- struct bkey_ptrs ptrs;
- union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bool have_cached_ptr;
- unsigned drop_dev = ptr->dev;
-
- guard(rcu)();
-restart_drop_ptrs:
- ptrs = bch2_bkey_ptrs(k);
- have_cached_ptr = false;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- /*
- * Check if it's erasure coded - stripes can't contain cached
- * data. Possibly something we can fix in the future?
- */
- if (&entry->ptr == ptr && p.has_ec)
- goto drop;
-
- if (p.ptr.cached) {
- if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) {
- bch2_bkey_drop_ptr_noerror(k, &entry->ptr);
- ptr = NULL;
- goto restart_drop_ptrs;
- }
-
- have_cached_ptr = true;
- }
- }
-
- if (!ptr)
- bkey_for_each_ptr(ptrs, ptr2)
- if (ptr2->dev == drop_dev)
- ptr = ptr2;
-
- if (have_cached_ptr || !want_cached_ptr(c, opts, ptr))
- goto drop;
-
- ptr->cached = true;
- return;
-drop:
- bch2_bkey_drop_ptr_noerror(k, ptr);
-}
-
-/*
- * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
- *
- * Returns true if @k should be dropped entirely
- *
- * For existing keys, only called when btree nodes are being rewritten, not when
- * they're merely being compacted/resorted in memory.
- */
-bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
-{
- struct bch_dev *ca;
-
- guard(rcu)();
- bch2_bkey_drop_ptrs(k, ptr,
- ptr->cached &&
- (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
- dev_ptr_stale_rcu(ca, ptr) > 0));
-
- return bkey_deleted(k.k);
-}
-
-/*
- * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
- *
- * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
- * the promote target.
- */
-bool bch2_extent_normalize_by_opts(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s k)
-{
- struct bkey_ptrs ptrs;
- bool have_cached_ptr;
-
- guard(rcu)();
-restart_drop_ptrs:
- ptrs = bch2_bkey_ptrs(k);
- have_cached_ptr = false;
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->cached) {
- if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
- bch2_bkey_drop_ptr(k, ptr);
- goto restart_drop_ptrs;
- }
- have_cached_ptr = true;
- }
-
- return bkey_deleted(k.k);
-}
-
-void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
-{
- out->atomic++;
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
- if (!ca) {
- prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "");
- } else {
- u32 offset;
- u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
- prt_printf(out, "ptr: %u:%llu:%u gen %u",
- ptr->dev, b, offset, ptr->gen);
- if (ca->mi.durability != 1)
- prt_printf(out, " d=%u", ca->mi.durability);
- if (ptr->cached)
- prt_str(out, " cached");
- if (ptr->unwritten)
- prt_str(out, " unwritten");
- int stale = dev_ptr_stale_rcu(ca, ptr);
- if (stale > 0)
- prt_printf(out, " stale");
- else if (stale)
- prt_printf(out, " invalid");
- }
- --out->atomic;
-}
-
-void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
-{
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
- crc->compressed_size,
- crc->uncompressed_size,
- crc->offset, crc->nonce);
- bch2_prt_csum_type(out, crc->csum_type);
- prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
- prt_str(out, " compress ");
- bch2_prt_compression_type(out, crc->compression_type);
-}
-
-static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
- const struct bch_extent_rebalance *r)
-{
- prt_str(out, "rebalance:");
-
- prt_printf(out, " replicas=%u", r->data_replicas);
- if (r->data_replicas_from_inode)
- prt_str(out, " (inode)");
-
- prt_str(out, " checksum=");
- bch2_prt_csum_opt(out, r->data_checksum);
- if (r->data_checksum_from_inode)
- prt_str(out, " (inode)");
-
- if (r->background_compression || r->background_compression_from_inode) {
- prt_str(out, " background_compression=");
- bch2_compression_opt_to_text(out, r->background_compression);
-
- if (r->background_compression_from_inode)
- prt_str(out, " (inode)");
- }
-
- if (r->background_target || r->background_target_from_inode) {
- prt_str(out, " background_target=");
- if (c)
- bch2_target_to_text(out, c, r->background_target);
- else
- prt_printf(out, "%u", r->background_target);
-
- if (r->background_target_from_inode)
- prt_str(out, " (inode)");
- }
-
- if (r->promote_target || r->promote_target_from_inode) {
- prt_str(out, " promote_target=");
- if (c)
- bch2_target_to_text(out, c, r->promote_target);
- else
- prt_printf(out, "%u", r->promote_target);
-
- if (r->promote_target_from_inode)
- prt_str(out, " (inode)");
- }
-
- if (r->erasure_code || r->erasure_code_from_inode) {
- prt_printf(out, " ec=%u", r->erasure_code);
- if (r->erasure_code_from_inode)
- prt_str(out, " (inode)");
- }
-}
-
-void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- bool first = true;
-
- if (c)
- prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
-
- bkey_extent_entry_for_each(ptrs, entry) {
- if (!first)
- prt_printf(out, " ");
-
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
- break;
-
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128: {
- struct bch_extent_crc_unpacked crc =
- bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
- bch2_extent_crc_unpacked_to_text(out, &crc);
- break;
- }
- case BCH_EXTENT_ENTRY_stripe_ptr: {
- const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
-
- prt_printf(out, "ec: idx %llu block %u",
- (u64) ec->idx, ec->block);
- break;
- }
- case BCH_EXTENT_ENTRY_rebalance:
- bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
- break;
-
- case BCH_EXTENT_ENTRY_flags:
- prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
- break;
-
- default:
- prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
- return;
- }
-
- first = false;
- }
-}
-
-static int extent_ptr_validate(struct bch_fs *c,
- struct bkey_s_c k,
- struct bkey_validate_context from,
- const struct bch_extent_ptr *ptr,
- unsigned size_ondisk,
- bool metadata)
-{
- int ret = 0;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr2)
- bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
- c, ptr_to_duplicate_device,
- "multiple pointers to same device (%u)", ptr->dev);
-
- /* bad pointers are repaired by check_fix_ptrs(): */
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
- if (!ca) {
- rcu_read_unlock();
- return 0;
- }
- u32 bucket_offset;
- u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
- unsigned first_bucket = ca->mi.first_bucket;
- u64 nbuckets = ca->mi.nbuckets;
- unsigned bucket_size = ca->mi.bucket_size;
- rcu_read_unlock();
-
- bkey_fsck_err_on(bucket >= nbuckets,
- c, ptr_after_last_bucket,
- "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
- bkey_fsck_err_on(bucket < first_bucket,
- c, ptr_before_first_bucket,
- "pointer before first bucket (%llu < %u)", bucket, first_bucket);
- bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
- c, ptr_spans_multiple_buckets,
- "pointer spans multiple buckets (%u + %u > %u)",
- bucket_offset, size_ondisk, bucket_size);
-fsck_err:
- return ret;
-}
-
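To make the bucket checks above concrete (values invented): on a device with 512-sector buckets, first_bucket 16 and nbuckets 1000, a pointer at sector 10000 with size_ondisk 128 lands in bucket 19 at offset 272, and 272 + 128 <= 512, so it does not span buckets. A standalone sketch of the arithmetic, assuming sector_to_bucket_and_offset() is a plain divide/remainder by the bucket size (it is defined outside this hunk):

#include <assert.h>

int main(void)
{
	unsigned long long offset = 10000, nbuckets = 1000;
	unsigned bucket_size = 512, first_bucket = 16, size_ondisk = 128;

	/* assumed behaviour of sector_to_bucket_and_offset() */
	unsigned long long bucket = offset / bucket_size;
	unsigned bucket_offset = offset % bucket_size;

	assert(bucket < nbuckets);				/* ptr_after_last_bucket */
	assert(bucket >= first_bucket);				/* ptr_before_first_bucket */
	assert(bucket_offset + size_ondisk <= bucket_size);	/* ptr_spans_multiple_buckets */
	return 0;
}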
-int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- unsigned size_ondisk = k.k->size;
- unsigned nonce = UINT_MAX;
- unsigned nr_ptrs = 0;
- bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
- int ret = 0;
-
- if (bkey_is_btree_ptr(k.k))
- size_ondisk = btree_sectors(c);
-
- bkey_extent_entry_for_each(ptrs, entry) {
- bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
- c, extent_ptrs_invalid_entry,
- "invalid extent entry type (got %u, max %u)",
- __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
-
- bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
- !extent_entry_is_ptr(entry),
- c, btree_ptr_has_non_ptr,
- "has non ptr field");
-
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
- if (ret)
- return ret;
-
- bkey_fsck_err_on(entry->ptr.cached && have_ec,
- c, ptr_cached_and_erasure_coded,
- "cached, erasure coded ptr");
-
- if (!entry->ptr.unwritten)
- have_written = true;
- else
- have_unwritten = true;
-
- have_ec = false;
- crc_since_last_ptr = false;
- nr_ptrs++;
- break;
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
- bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
- c, ptr_crc_csum_type_unknown,
- "invalid checksum type");
- bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
- c, ptr_crc_compression_type_unknown,
- "invalid compression type");
-
- bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
- c, ptr_crc_uncompressed_size_too_small,
- "checksum offset + key size > uncompressed size");
- bkey_fsck_err_on(crc_is_encoded(crc) &&
- (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
- (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
- c, ptr_crc_uncompressed_size_too_big,
- "too large encoded extent");
- bkey_fsck_err_on(!crc_is_compressed(crc) &&
- crc.compressed_size != crc.uncompressed_size,
- c, ptr_crc_uncompressed_size_mismatch,
- "not compressed but compressed != uncompressed size");
-
- if (bch2_csum_type_is_encryption(crc.csum_type)) {
- if (nonce == UINT_MAX)
- nonce = crc.offset + crc.nonce;
- else if (nonce != crc.offset + crc.nonce)
- bkey_fsck_err(c, ptr_crc_nonce_mismatch,
- "incorrect nonce");
- }
-
- bkey_fsck_err_on(crc_since_last_ptr,
- c, ptr_crc_redundant,
- "redundant crc entry");
- crc_since_last_ptr = true;
-
- size_ondisk = crc.compressed_size;
- break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- bkey_fsck_err_on(have_ec,
- c, ptr_stripe_redundant,
- "redundant stripe entry");
- have_ec = true;
- break;
- case BCH_EXTENT_ENTRY_rebalance: {
- /*
- * this shouldn't be a fsck error, for forward
- * compatibility; the rebalance code should just refetch
- * the compression opt if it's unknown
- */
-#if 0
- const struct bch_extent_rebalance *r = &entry->rebalance;
-
- if (!bch2_compression_opt_valid(r->compression)) {
- struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
- prt_printf(err, "invalid compression opt %u:%u",
- opt.type, opt.level);
- return bch_err_throw(c, invalid_bkey);
- }
-#endif
- break;
- }
- case BCH_EXTENT_ENTRY_flags:
- bkey_fsck_err_on(entry != ptrs.start,
- c, extent_flags_not_at_start,
- "extent flags entry not at start");
- break;
- }
- }
-
- bkey_fsck_err_on(!nr_ptrs,
- c, extent_ptrs_no_ptrs,
- "no ptrs");
- bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
- c, extent_ptrs_too_many_ptrs,
- "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
- bkey_fsck_err_on(have_written && have_unwritten,
- c, extent_ptrs_written_and_unwritten,
- "extent with unwritten and written ptrs");
- bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
- c, extent_ptrs_unwritten,
- "has unwritten ptrs");
- bkey_fsck_err_on(crc_since_last_ptr,
- c, extent_ptrs_redundant_crc,
- "redundant crc entry");
- bkey_fsck_err_on(have_ec,
- c, extent_ptrs_redundant_stripe,
- "redundant stripe entry");
-fsck_err:
- return ret;
-}
-
-void bch2_ptr_swab(struct bkey_s k)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
- u64 *d;
-
- for (d = (u64 *) ptrs.start;
- d != (u64 *) ptrs.end;
- d++)
- *d = swab64(*d);
-
- for (entry = ptrs.start;
- entry < ptrs.end;
- entry = extent_entry_next(entry)) {
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- break;
- case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.csum = swab32(entry->crc32.csum);
- break;
- case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
- entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
- break;
- case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.csum.hi = (__force __le64)
- swab64((__force u64) entry->crc128.csum.hi);
- entry->crc128.csum.lo = (__force __le64)
- swab64((__force u64) entry->crc128.csum.lo);
- break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- break;
- case BCH_EXTENT_ENTRY_rebalance:
- break;
- default:
- /* Bad entry type: will be caught by validate() */
- return;
- }
- }
-}
-
-int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
-{
- int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
- if (ret)
- return ret;
-
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-
- if (ptrs.start != ptrs.end &&
- extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
- ptrs.start->flags.flags = flags;
- } else {
- struct bch_extent_flags f = {
- .type = BIT(BCH_EXTENT_ENTRY_flags),
- .flags = flags,
- };
- __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
- }
-
- return 0;
-}
-
-/* Generic extent code: */
-
-int bch2_cut_front_s(struct bpos where, struct bkey_s k)
-{
- unsigned new_val_u64s = bkey_val_u64s(k.k);
- int val_u64s_delta;
- u64 sub;
-
- if (bkey_le(where, bkey_start_pos(k.k)))
- return 0;
-
- EBUG_ON(bkey_gt(where, k.k->p));
-
- sub = where.offset - bkey_start_offset(k.k);
-
- k.k->size -= sub;
-
- if (!k.k->size) {
- k.k->type = KEY_TYPE_deleted;
- new_val_u64s = 0;
- }
-
- switch (k.k->type) {
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v: {
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
- bool seen_crc = false;
-
- bkey_extent_entry_for_each(ptrs, entry) {
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- if (!seen_crc)
- entry->ptr.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.offset += sub;
- break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- case BCH_EXTENT_ENTRY_rebalance:
- case BCH_EXTENT_ENTRY_flags:
- break;
- }
-
- if (extent_entry_is_crc(entry))
- seen_crc = true;
- }
-
- break;
- }
- case KEY_TYPE_reflink_p: {
- struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
-
- SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
- break;
- }
- case KEY_TYPE_inline_data:
- case KEY_TYPE_indirect_inline_data: {
- void *p = bkey_inline_data_p(k);
- unsigned bytes = bkey_inline_data_bytes(k.k);
-
- sub = min_t(u64, sub << 9, bytes);
-
- memmove(p, p + sub, bytes - sub);
-
- new_val_u64s -= sub >> 3;
- break;
- }
- }
-
- val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
- BUG_ON(val_u64s_delta < 0);
-
- set_bkey_val_u64s(k.k, new_val_u64s);
- memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
- return -val_u64s_delta;
-}
-
-int bch2_cut_back_s(struct bpos where, struct bkey_s k)
-{
- unsigned new_val_u64s = bkey_val_u64s(k.k);
- int val_u64s_delta;
- u64 len = 0;
-
- if (bkey_ge(where, k.k->p))
- return 0;
-
- EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
-
- len = where.offset - bkey_start_offset(k.k);
-
- k.k->p.offset = where.offset;
- k.k->size = len;
-
- if (!len) {
- k.k->type = KEY_TYPE_deleted;
- new_val_u64s = 0;
- }
-
- switch (k.k->type) {
- case KEY_TYPE_inline_data:
- case KEY_TYPE_indirect_inline_data:
- new_val_u64s = (bkey_inline_data_offset(k.k) +
- min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
- break;
- }
-
- val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
- BUG_ON(val_u64s_delta < 0);
-
- set_bkey_val_u64s(k.k, new_val_u64s);
- memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
- return -val_u64s_delta;
-}
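A worked example of the two trims above (invented numbers): a 16-sector uncompressed extent covering [0, 16) with its single pointer at device sector 2048. Cutting the front at offset 4 shrinks it to [4, 16): size becomes 12 and, with no crc entry present, ptr.offset advances to 2052. Cutting the back at offset 12 then shrinks it to [4, 12): p.offset becomes 12 and size 8, with no pointer adjustment needed. A standalone restatement of the uncompressed case:

#include <assert.h>

struct ext_sketch {
	unsigned long long	p_offset;	/* end position, like bkey.p.offset */
	unsigned		size;
	unsigned long long	ptr_offset;	/* device sector of the single pointer */
};

/* mirrors bch2_cut_front_s() for an extent with no crc entries */
static void cut_front(unsigned long long where, struct ext_sketch *e)
{
	unsigned sub = where - (e->p_offset - e->size);

	e->size -= sub;
	e->ptr_offset += sub;
}

/* mirrors bch2_cut_back_s() */
static void cut_back(unsigned long long where, struct ext_sketch *e)
{
	e->size = where - (e->p_offset - e->size);
	e->p_offset = where;
}

int main(void)
{
	struct ext_sketch e = { .p_offset = 16, .size = 16, .ptr_offset = 2048 };

	cut_front(4, &e);
	assert(e.size == 12 && e.ptr_offset == 2052 && e.p_offset == 16);

	cut_back(12, &e);
	assert(e.size == 8 && e.p_offset == 12 && e.ptr_offset == 2052);
	return 0;
}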
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
deleted file mode 100644
index b8590e51b76e..000000000000
--- a/fs/bcachefs/extents.h
+++ /dev/null
@@ -1,768 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_H
-#define _BCACHEFS_EXTENTS_H
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "extents_types.h"
-
-struct bch_fs;
-struct btree_trans;
-
-/* extent entries: */
-
-#define extent_entry_last(_e) \
- ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
-
-#define entry_to_ptr(_entry) \
-({ \
- EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
- \
- __builtin_choose_expr( \
- type_is_exact(_entry, const union bch_extent_entry *), \
- (const struct bch_extent_ptr *) (_entry), \
- (struct bch_extent_ptr *) (_entry)); \
-})
-
-/* downcast, preserves const */
-#define to_entry(_entry) \
-({ \
- BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
- !type_is(_entry, struct bch_extent_ptr *) && \
- !type_is(_entry, struct bch_extent_stripe_ptr *)); \
- \
- __builtin_choose_expr( \
- (type_is_exact(_entry, const union bch_extent_crc *) || \
- type_is_exact(_entry, const struct bch_extent_ptr *) ||\
- type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
- (const union bch_extent_entry *) (_entry), \
- (union bch_extent_entry *) (_entry)); \
-})
-
-#define extent_entry_next(_entry) \
- ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
-
-#define extent_entry_next_safe(_entry, _end) \
- (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \
- ? extent_entry_next(_entry) \
- : _end)
-
-static inline unsigned
-__extent_entry_type(const union bch_extent_entry *e)
-{
- return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
-}
-
-static inline enum bch_extent_entry_type
-extent_entry_type(const union bch_extent_entry *e)
-{
- int ret = __ffs(e->type);
-
- EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
-
- return ret;
-}
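The entry type is stored as a one-hot bit in the leading type byte, so __ffs() recovers the enum value directly, and a zero type byte (no bit set) is reported as BCH_EXTENT_ENTRY_MAX, i.e. invalid. A standalone illustration of the encoding; ffs() from strings.h stands in for the kernel's __ffs(), and ENTRY_MAX is a placeholder value:

#include <assert.h>
#include <strings.h>

#define ENTRY_MAX 16	/* placeholder for BCH_EXTENT_ENTRY_MAX */

static unsigned entry_type_sketch(unsigned char type_byte)
{
	/* ffs() is 1-based, __ffs() is 0-based, hence the - 1 */
	return type_byte ? (unsigned) (ffs(type_byte) - 1) : ENTRY_MAX;
}

int main(void)
{
	/* a one-hot type byte decodes back to its bit index */
	for (unsigned t = 0; t < 8; t++)
		assert(entry_type_sketch(1U << t) == t);

	/* an all-zero type byte is reported as invalid */
	assert(entry_type_sketch(0) == ENTRY_MAX);
	return 0;
}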
-
-static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
-{
- switch (extent_entry_type(entry)) {
-#define x(f, n) \
- case BCH_EXTENT_ENTRY_##f: \
- return sizeof(struct bch_extent_##f);
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
- default:
- BUG();
- }
-}
-
-static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
-{
- return extent_entry_bytes(entry) / sizeof(u64);
-}
-
-static inline void __extent_entry_insert(struct bkey_i *k,
- union bch_extent_entry *dst,
- union bch_extent_entry *new)
-{
- union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-
- memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
- dst, (u64 *) end - (u64 *) dst);
- k->k.u64s += extent_entry_u64s(new);
- memcpy_u64s_small(dst, new, extent_entry_u64s(new));
-}
-
-static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
- union bch_extent_entry *next = extent_entry_next(entry);
-
- /* stripes have ptrs, but their layout doesn't work with this code */
- BUG_ON(k.k->type == KEY_TYPE_stripe);
-
- memmove_u64s_down(entry, next,
- (u64 *) bkey_val_end(k) - (u64 *) next);
- k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
-static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
-{
- return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
-}
-
-static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
-{
- return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
-}
-
-static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
-{
- switch (__extent_entry_type(e)) {
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- return true;
- default:
- return false;
- }
-}
-
-union bch_extent_crc {
- u8 type;
- struct bch_extent_crc32 crc32;
- struct bch_extent_crc64 crc64;
- struct bch_extent_crc128 crc128;
-};
-
-#define __entry_to_crc(_entry) \
- __builtin_choose_expr( \
- type_is_exact(_entry, const union bch_extent_entry *), \
- (const union bch_extent_crc *) (_entry), \
- (union bch_extent_crc *) (_entry))
-
-#define entry_to_crc(_entry) \
-({ \
- EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \
- \
- __entry_to_crc(_entry); \
-})
-
-static inline struct bch_extent_crc_unpacked
-bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
-{
-#define common_fields(_crc) \
- .csum_type = _crc.csum_type, \
- .compression_type = _crc.compression_type, \
- .compressed_size = _crc._compressed_size + 1, \
- .uncompressed_size = _crc._uncompressed_size + 1, \
- .offset = _crc.offset, \
- .live_size = k->size
-
- if (!crc)
- return (struct bch_extent_crc_unpacked) {
- .compressed_size = k->size,
- .uncompressed_size = k->size,
- .live_size = k->size,
- };
-
- switch (extent_entry_type(to_entry(crc))) {
- case BCH_EXTENT_ENTRY_crc32: {
- struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
- common_fields(crc->crc32),
- };
-
- *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
- return ret;
- }
- case BCH_EXTENT_ENTRY_crc64: {
- struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
- common_fields(crc->crc64),
- .nonce = crc->crc64.nonce,
- .csum.lo = (__force __le64) crc->crc64.csum_lo,
- };
-
- *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
-
- return ret;
- }
- case BCH_EXTENT_ENTRY_crc128: {
- struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
- common_fields(crc->crc128),
- .nonce = crc->crc128.nonce,
- .csum = crc->crc128.csum,
- };
-
- return ret;
- }
- default:
- BUG();
- }
-#undef common_fields
-}
-
-static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
-{
- return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
- crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
-}
-
-static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
-{
- return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
-}
-
-void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *);
-
-/* bkey_ptrs: generically over any key type that has ptrs */
-
-struct bkey_ptrs_c {
- const union bch_extent_entry *start;
- const union bch_extent_entry *end;
-};
-
-struct bkey_ptrs {
- union bch_extent_entry *start;
- union bch_extent_entry *end;
-};
-
-static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr: {
- struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
-
- return (struct bkey_ptrs_c) {
- to_entry(&e.v->start[0]),
- to_entry(extent_entry_last(e))
- };
- }
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-
- return (struct bkey_ptrs_c) {
- e.v->start,
- extent_entry_last(e)
- };
- }
- case KEY_TYPE_stripe: {
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
- return (struct bkey_ptrs_c) {
- to_entry(&s.v->ptrs[0]),
- to_entry(&s.v->ptrs[s.v->nr_blocks]),
- };
- }
- case KEY_TYPE_reflink_v: {
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
- return (struct bkey_ptrs_c) {
- r.v->start,
- bkey_val_end(r),
- };
- }
- case KEY_TYPE_btree_ptr_v2: {
- struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
-
- return (struct bkey_ptrs_c) {
- to_entry(&e.v->start[0]),
- to_entry(extent_entry_last(e))
- };
- }
- default:
- return (struct bkey_ptrs_c) { NULL, NULL };
- }
-}
-
-static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
-{
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
-
- return (struct bkey_ptrs) {
- (void *) p.start,
- (void *) p.end
- };
-}
-
-#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
- for ((_entry) = (_start); \
- (_entry) < (_end); \
- (_entry) = extent_entry_next_safe(_entry, _end))
-
-#define __bkey_ptr_next(_ptr, _end) \
-({ \
- typeof(_end) _entry; \
- \
- __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \
- if (extent_entry_is_ptr(_entry)) \
- break; \
- \
- _entry < (_end) ? entry_to_ptr(_entry) : NULL; \
-})
-
-#define bkey_extent_entry_for_each_from(_p, _entry, _start) \
- __bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
-
-#define bkey_extent_entry_for_each(_p, _entry) \
- bkey_extent_entry_for_each_from(_p, _entry, _p.start)
-
-#define __bkey_for_each_ptr(_start, _end, _ptr) \
- for (typeof(_start) (_ptr) = (_start); \
- ((_ptr) = __bkey_ptr_next(_ptr, _end)); \
- (_ptr)++)
-
-#define bkey_ptr_next(_p, _ptr) \
- __bkey_ptr_next(_ptr, (_p).end)
-
-#define bkey_for_each_ptr(_p, _ptr) \
- __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
-
-#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \
-({ \
- __label__ out; \
- \
- (_ptr).has_ec = false; \
- (_ptr).do_ec_reconstruct = false; \
- (_ptr).crc_retry_nr = 0; \
- \
- __bkey_extent_entry_for_each_from(_entry, _end, _entry) \
- switch (__extent_entry_type(_entry)) { \
- case BCH_EXTENT_ENTRY_ptr: \
- (_ptr).ptr = _entry->ptr; \
- goto out; \
- case BCH_EXTENT_ENTRY_crc32: \
- case BCH_EXTENT_ENTRY_crc64: \
- case BCH_EXTENT_ENTRY_crc128: \
- (_ptr).crc = bch2_extent_crc_unpack(_k, \
- entry_to_crc(_entry)); \
- break; \
- case BCH_EXTENT_ENTRY_stripe_ptr: \
- (_ptr).ec = _entry->stripe_ptr; \
- (_ptr).has_ec = true; \
- break; \
- default: \
- /* nothing */ \
- break; \
- } \
-out: \
- _entry < (_end); \
-})
-
-#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \
- for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
- (_entry) = _start; \
- __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
- (_entry) = extent_entry_next_safe(_entry, _end))
-
-#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
- __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
- _ptr, _entry)
-
-#define bkey_crc_next(_k, _end, _crc, _iter) \
-({ \
- __bkey_extent_entry_for_each_from(_iter, _end, _iter) \
- if (extent_entry_is_crc(_iter)) { \
- (_crc) = bch2_extent_crc_unpack(_k, \
- entry_to_crc(_iter)); \
- break; \
- } \
- \
- (_iter) < (_end); \
-})
-
-#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
- for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
- (_iter) = (_start); \
- bkey_crc_next(_k, _end, _crc, _iter); \
- (_iter) = extent_entry_next(_iter))
-
-#define bkey_for_each_crc(_k, _p, _crc, _iter) \
- __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
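A minimal usage sketch of the crc iterator above, in the same style as bch2_bkey_is_incompressible() in extents.c: walk every checksum entry of a key and report the largest compressed size (purely illustrative, the helper name is made up):

static unsigned max_compressed_size_sketch(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct bch_extent_crc_unpacked crc;
	unsigned ret = 0;

	bkey_for_each_crc(k.k, ptrs, crc, entry)
		ret = max(ret, (unsigned) crc.compressed_size);
	return ret;
}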
-
-/* Iterate over pointers in KEY_TYPE_extent: */
-
-#define extent_ptr_next(_e, _ptr) \
- __bkey_ptr_next(_ptr, extent_entry_last(_e))
-
-#define extent_for_each_ptr(_e, _ptr) \
- __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
-
-#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
- __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
- extent_entry_last(_e), _ptr, _entry)
-
-/* utility code common to all keys with pointers: */
-
-void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *,
- struct bch_io_failures *);
-struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
- unsigned);
-void bch2_mark_io_failure(struct bch_io_failures *,
- struct extent_ptr_decoded *, bool);
-void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned);
-int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
- struct bch_io_failures *,
- struct extent_ptr_decoded *, int);
-
-/* KEY_TYPE_btree_ptr: */
-
-int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-
-int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
- int, struct bkey_s);
-
-#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \
- .key_validate = bch2_btree_ptr_validate, \
- .val_to_text = bch2_btree_ptr_to_text, \
- .swab = bch2_ptr_swab, \
- .trigger = bch2_trigger_extent, \
-})
-
-#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
- .key_validate = bch2_btree_ptr_v2_validate, \
- .val_to_text = bch2_btree_ptr_v2_to_text, \
- .swab = bch2_ptr_swab, \
- .compat = bch2_btree_ptr_v2_compat, \
- .trigger = bch2_trigger_extent, \
- .min_val_size = 40, \
-})
-
-/* KEY_TYPE_extent: */
-
-bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-#define bch2_bkey_ops_extent ((struct bkey_ops) { \
- .key_validate = bch2_bkey_ptrs_validate, \
- .val_to_text = bch2_bkey_ptrs_to_text, \
- .swab = bch2_ptr_swab, \
- .key_normalize = bch2_extent_normalize, \
- .key_merge = bch2_extent_merge, \
- .trigger = bch2_trigger_extent, \
-})
-
-/* KEY_TYPE_reservation: */
-
-int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-
-#define bch2_bkey_ops_reservation ((struct bkey_ops) { \
- .key_validate = bch2_reservation_validate, \
- .val_to_text = bch2_reservation_to_text, \
- .key_merge = bch2_reservation_merge, \
- .trigger = bch2_trigger_reservation, \
- .min_val_size = 8, \
-})
-
-/* Extent checksum entries: */
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
- struct bch_extent_crc_unpacked);
-bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
-void bch2_extent_crc_append(struct bkey_i *,
- struct bch_extent_crc_unpacked);
-
-/* Generic code for keys with pointers: */
-
-static inline bool bkey_is_btree_ptr(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_direct_data(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_inline_data(const struct bkey *k)
-{
- return k->type == KEY_TYPE_inline_data ||
- k->type == KEY_TYPE_indirect_inline_data;
-}
-
-static inline unsigned bkey_inline_data_offset(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_inline_data:
- return sizeof(struct bch_inline_data);
- case KEY_TYPE_indirect_inline_data:
- return sizeof(struct bch_indirect_inline_data);
- default:
- BUG();
- }
-}
-
-static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
-{
- return bkey_val_bytes(k) - bkey_inline_data_offset(k);
-}
-
-#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k))
-
-static inline bool bkey_extent_is_data(const struct bkey *k)
-{
- return bkey_extent_is_direct_data(k) ||
- bkey_extent_is_inline_data(k) ||
- k->type == KEY_TYPE_reflink_p;
-}
-
-/*
- * Should extent be counted under inode->i_sectors?
- */
-static inline bool bkey_extent_is_allocation(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_extent:
- case KEY_TYPE_reservation:
- case KEY_TYPE_reflink_p:
- case KEY_TYPE_reflink_v:
- case KEY_TYPE_inline_data:
- case KEY_TYPE_indirect_inline_data:
- case KEY_TYPE_error:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->unwritten)
- return true;
- return false;
-}
-
-static inline bool bkey_extent_is_reservation(struct bkey_s_c k)
-{
- return k.k->type == KEY_TYPE_reservation ||
- bkey_extent_is_unwritten(k);
-}
-
-static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(p, ptr)
- ret.data[ret.nr++] = ptr->dev;
-
- return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(p, ptr)
- if (!ptr->cached)
- ret.data[ret.nr++] = ptr->dev;
-
- return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-
- bkey_for_each_ptr(p, ptr)
- if (ptr->cached)
- ret.data[ret.nr++] = ptr->dev;
-
- return ret;
-}
-
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
-unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
-bool bch2_bkey_is_incompressible(struct bkey_s_c);
-unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-
-unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *);
-unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
-unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-
-const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
-
-static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
-{
- return (void *) bch2_bkey_has_device_c(k.s_c, dev);
-}
-
-bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
-
-void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
-
-static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
-{
- struct bch_extent_ptr *dest;
-
- EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
-
- switch (k->k.type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
-
- ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
- *dest = ptr;
- k->k.u64s++;
- break;
- default:
- BUG();
- }
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *,
- struct extent_ptr_decoded *);
-void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
-void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
-
-void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-
-#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
-do { \
- __label__ _again; \
- struct bkey_ptrs _ptrs; \
-_again: \
- _ptrs = bch2_bkey_ptrs(_k); \
- \
- bkey_for_each_ptr(_ptrs, _ptr) \
- if (_cond) { \
- bch2_bkey_drop_ptr_noerror(_k, _ptr); \
- goto _again; \
- } \
-} while (0)
-
-#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
-do { \
- __label__ _again; \
- struct bkey_ptrs _ptrs; \
-_again: \
- _ptrs = bch2_bkey_ptrs(_k); \
- \
- bkey_for_each_ptr(_ptrs, _ptr) \
- if (_cond) { \
- bch2_bkey_drop_ptr(_k, _ptr); \
- goto _again; \
- } \
-} while (0)
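A hedged usage sketch of the drop loop above; the restart-from-the-top structure matters because dropping an entry shifts everything after it and invalidates the current iteration. This mirrors the existing callers in extents.c (bch2_bkey_drop_device(), bch2_extent_normalize()); here, dropping every cached pointer that falls outside a given target (the helper name is made up):

static void drop_cached_outside_target_sketch(struct bch_fs *c,
					      struct bkey_s k,
					      unsigned target)
{
	bch2_bkey_drop_ptrs(k, ptr,
		ptr->cached && !bch2_dev_in_target(c, ptr->dev, target));
}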
-
-bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
- struct bch_extent_ptr, u64);
-bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
-struct bch_extent_ptr *
-bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
-
-void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *,
- struct bkey_s, struct bch_extent_ptr *);
-
-bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s);
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-
-void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
-void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-
-static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
- struct bch_extent_ptr ptr2)
-{
- return (ptr1.cached == ptr2.cached &&
- ptr1.unwritten == ptr2.unwritten &&
- ptr1.offset == ptr2.offset &&
- ptr1.dev == ptr2.dev &&
- ptr1.gen == ptr2.gen);
-}
-
-void bch2_ptr_swab(struct bkey_s);
-
-/* Generic extent code: */
-
-enum bch_extent_overlap {
- BCH_EXTENT_OVERLAP_ALL = 0,
- BCH_EXTENT_OVERLAP_BACK = 1,
- BCH_EXTENT_OVERLAP_FRONT = 2,
- BCH_EXTENT_OVERLAP_MIDDLE = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
- const struct bkey *m)
-{
- int cmp1 = bkey_lt(k->p, m->p);
- int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m));
-
- return (cmp1 << 1) + cmp2;
-}
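The (cmp1 << 1) + cmp2 trick above maps the two comparisons directly onto the enum values: cmp1 is "k ends before m" and cmp2 is "k starts after m". A standalone check of all four cases on plain half-open offset ranges (illustrative only):

#include <assert.h>

enum overlap_sketch { ALL = 0, BACK = 1, FRONT = 2, MIDDLE = 3 };

/* same computation as bch2_extent_overlap(), on [start, end) offsets */
static enum overlap_sketch overlap(unsigned long long k_start, unsigned long long k_end,
				   unsigned long long m_start, unsigned long long m_end)
{
	int cmp1 = k_end < m_end;	/* k ends before m */
	int cmp2 = k_start > m_start;	/* k starts after m */

	return (enum overlap_sketch) ((cmp1 << 1) + cmp2);
}

int main(void)
{
	/* m covers [10, 20) in every case */
	assert(overlap(0, 30, 10, 20) == ALL);		/* k covers all of m */
	assert(overlap(15, 30, 10, 20) == BACK);	/* k overlaps the back of m */
	assert(overlap(0, 15, 10, 20) == FRONT);	/* k overlaps the front of m */
	assert(overlap(12, 18, 10, 20) == MIDDLE);	/* k sits inside m */
	return 0;
}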
-
-int bch2_cut_front_s(struct bpos, struct bkey_s);
-int bch2_cut_back_s(struct bpos, struct bkey_s);
-
-static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
-{
- bch2_cut_front_s(where, bkey_i_to_s(k));
-}
-
-static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
-{
- bch2_cut_back_s(where, bkey_i_to_s(k));
-}
-
-/**
- * bch2_key_resize - adjust size of @k
- *
- * bkey_start_offset(k) will be preserved, modifies where the extent ends
- */
-static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
-{
- k->p.offset -= k->size;
- k->p.offset += new_size;
- k->size = new_size;
-}
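
A hedged worked example (illustrative numbers, not part of the original header) of the arithmetic above:

/*
 *   before: bkey_start_offset(k) = 4, k->size = 8, k->p.offset = 12   -> [4, 12)
 *   bch2_key_resize(k, 3):
 *     k->p.offset = 12 - 8 + 3 = 7
 *     k->size     = 3                                                 -> [4, 7)
 *
 * The start offset is unchanged; only where the extent ends moves.
 */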
-
-static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
-{
- if (ptrs.start != ptrs.end &&
- extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
- return ptrs.start->flags.flags;
- return 0;
-}
-
-static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
-{
- return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
-}
-
-int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
-
-#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
deleted file mode 100644
index 74c0252cbd98..000000000000
--- a/fs/bcachefs/extents_format.h
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_FORMAT_H
-#define _BCACHEFS_EXTENTS_FORMAT_H
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_ptr    - 0b1
- * bch_extent_crc32  - 0b10
- * bch_extent_crc64  - 0b100
- * bch_extent_crc128 - 0b1000
- *
- * (matching BCH_EXTENT_ENTRY_TYPES() and the width of each entry's type
- * bitfield below). We do it this way because the entries most constrained on
- * bits (bch_extent_ptr, bch_extent_crc32) get the shortest type codes, while
- * the roomier bch_extent_crc64/crc128 get the longer ones.
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
-#define BCH_EXTENT_ENTRY_TYPES() \
- x(ptr, 0) \
- x(crc32, 1) \
- x(crc64, 2) \
- x(crc128, 3) \
- x(stripe_ptr, 4) \
- x(rebalance, 5) \
- x(flags, 6)
-#define BCH_EXTENT_ENTRY_MAX 7
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u32 type:2,
- _compressed_size:7,
- _uncompressed_size:7,
- offset:7,
- _unused:1,
- csum_type:4,
- compression_type:4;
- __u32 csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u32 csum;
- __u32 compression_type:4,
- csum_type:4,
- _unused:1,
- offset:7,
- _uncompressed_size:7,
- _compressed_size:7,
- type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX (1U << 7)
-#define CRC32_NONCE_MAX 0
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:3,
- _compressed_size:9,
- _uncompressed_size:9,
- offset:9,
- nonce:10,
- csum_type:4,
- compression_type:4,
- csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 csum_hi:16,
- compression_type:4,
- csum_type:4,
- nonce:10,
- offset:9,
- _uncompressed_size:9,
- _compressed_size:9,
- type:3;
-#endif
- __u64 csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX (1U << 9)
-#define CRC64_NONCE_MAX ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:4,
- _compressed_size:13,
- _uncompressed_size:13,
- offset:13,
- nonce:13,
- csum_type:4,
- compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 compression_type:4,
- csum_type:4,
- nonce:13,
- offset:13,
- _uncompressed_size:13,
- _compressed_size:13,
- type:4;
-#endif
- struct bch_csum csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX (1U << 13)
-#define CRC128_NONCE_MAX ((1U << 13) - 1)
-
-/*
- * @reservation - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:1,
- cached:1,
- unused:1,
- unwritten:1,
- offset:44, /* 8 petabytes */
- dev:8,
- gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 gen:8,
- dev:8,
- offset:44,
- unwritten:1,
- unused:1,
- cached:1,
- type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:5,
- block:8,
- redundancy:4,
- idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:47,
- redundancy:4,
- block:8,
- type:5;
-#endif
-};
-
-#define BCH_EXTENT_FLAGS() \
- x(poisoned, 0)
-
-enum bch_extent_flags_e {
-#define x(n, v) BCH_EXTENT_FLAG_##n = v,
- BCH_EXTENT_FLAGS()
-#undef x
-};
-
-struct bch_extent_flags {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:7,
- flags:57;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 flags:57,
- type:7;
-#endif
-};
-
-/* bch_extent_rebalance: */
-#include "rebalance_format.h"
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
- unsigned long type;
-#elif __BITS_PER_LONG == 32
- struct {
- unsigned long pad;
- unsigned long type;
- };
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f f;
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
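
A hedged sketch, not part of the original header (the helper name is illustrative, __ffs() comes from <linux/bitops.h>, and a non-zero type field is assumed): decoding the first-set-bit encoding described at the top of this file back into an entry type.

static inline enum bch_extent_entry_type
example_extent_entry_type(const union bch_extent_entry *e)
{
	/* type is stored as 1 << entry_type, so the type is the first set bit */
	return (enum bch_extent_entry_type) __ffs(e->type);
}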
-
-struct bch_btree_ptr {
- struct bch_val v;
-
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
- struct bch_val v;
-
- __u64 mem_ptr;
- __le64 seq;
- __le16 sectors_written;
- __le16 flags;
- struct bpos min_key;
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
- struct bch_val v;
-
- __u64 _data[0];
- union bch_extent_entry start[];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
- ((sizeof(struct bch_extent_crc128) + \
- sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX \
- (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_btree_ptr_v2) + \
- sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX \
- (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-struct bch_reservation {
- struct bch_val v;
-
- __le32 generation;
- __u8 nr_replicas;
- __u8 pad[3];
-} __packed __aligned(8);
-
-struct bch_inline_data {
- struct bch_val v;
- u8 data[];
-};
-
-#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
deleted file mode 100644
index b23ce4a373c0..000000000000
--- a/fs/bcachefs/extents_types.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_EXTENTS_TYPES_H
-#define _BCACHEFS_EXTENTS_TYPES_H
-
-#include "bcachefs_format.h"
-
-struct bch_extent_crc_unpacked {
- u32 compressed_size;
- u32 uncompressed_size;
- u32 live_size;
-
- u8 csum_type;
- u8 compression_type;
-
- u16 offset;
-
- u16 nonce;
-
- struct bch_csum csum;
-};
-
-struct extent_ptr_decoded {
- bool has_ec;
- bool do_ec_reconstruct;
- u8 crc_retry_nr;
- struct bch_extent_crc_unpacked crc;
- struct bch_extent_ptr ptr;
- struct bch_extent_stripe_ptr ec;
-};
-
-struct bch_io_failures {
- u8 nr;
- struct bch_dev_io_failures {
- u8 dev;
- unsigned failed_csum_nr:6,
- failed_io:1,
- failed_btree_validate:1,
- failed_ec:1;
- } devs[BCH_REPLICAS_MAX + 1];
-};
-
-#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
deleted file mode 100644
index 0e742555cb0a..000000000000
--- a/fs/bcachefs/eytzinger.c
+++ /dev/null
@@ -1,315 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "eytzinger.h"
-
-/**
- * is_aligned - is this pointer & size okay for word-wide copying?
- * @base: pointer to data
- * @size: size of each element
- * @align: required alignment (typically 4 or 8)
- *
- * Returns true if elements can be copied using word loads and stores.
- * The size must be a multiple of the alignment, and the base address must
- * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
- *
- * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
- * to "if ((a | b) & mask)", so we do that by hand.
- */
-__attribute_const__ __always_inline
-static bool is_aligned(const void *base, size_t size, unsigned char align)
-{
- unsigned char lsbits = (unsigned char)size;
-
- (void)base;
-#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- lsbits |= (unsigned char)(uintptr_t)base;
-#endif
- return (lsbits & (align - 1)) == 0;
-}
-
-/**
- * swap_words_32 - swap two elements in 32-bit chunks
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size (must be a multiple of 4)
- *
- * Exchange the two objects in memory. This exploits base+index addressing,
- * which basically all CPUs have, to minimize loop overhead computations.
- *
- * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
- * bottom of the loop, even though the zero flag is still valid from the
- * subtract (since the intervening mov instructions don't alter the flags).
- * Gcc 8.1.0 doesn't have that problem.
- */
-static void swap_words_32(void *a, void *b, size_t n)
-{
- do {
- u32 t = *(u32 *)(a + (n -= 4));
- *(u32 *)(a + n) = *(u32 *)(b + n);
- *(u32 *)(b + n) = t;
- } while (n);
-}
-
-/**
- * swap_words_64 - swap two elements in 64-bit chunks
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size (must be a multiple of 8)
- *
- * Exchange the two objects in memory. This exploits base+index
- * addressing, which basically all CPUs have, to minimize loop overhead
- * computations.
- *
- * We'd like to use 64-bit loads if possible. If they're not, emulating
- * one requires base+index+4 addressing which x86 has but most other
- * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
- * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
- * x32 ABI). Are there any cases the kernel needs to worry about?
- */
-static void swap_words_64(void *a, void *b, size_t n)
-{
- do {
-#ifdef CONFIG_64BIT
- u64 t = *(u64 *)(a + (n -= 8));
- *(u64 *)(a + n) = *(u64 *)(b + n);
- *(u64 *)(b + n) = t;
-#else
- /* Use two 32-bit transfers to avoid base+index+4 addressing */
- u32 t = *(u32 *)(a + (n -= 4));
- *(u32 *)(a + n) = *(u32 *)(b + n);
- *(u32 *)(b + n) = t;
-
- t = *(u32 *)(a + (n -= 4));
- *(u32 *)(a + n) = *(u32 *)(b + n);
- *(u32 *)(b + n) = t;
-#endif
- } while (n);
-}
-
-/**
- * swap_bytes - swap two elements a byte at a time
- * @a: pointer to the first element to swap
- * @b: pointer to the second element to swap
- * @n: element size
- *
- * This is the fallback if alignment doesn't allow using larger chunks.
- */
-static void swap_bytes(void *a, void *b, size_t n)
-{
- do {
- char t = ((char *)a)[--n];
- ((char *)a)[n] = ((char *)b)[n];
- ((char *)b)[n] = t;
- } while (n);
-}
-
-/*
- * The values are arbitrary as long as they can't be confused with
- * a pointer, but small integers make for the smallest compare
- * instructions.
- */
-#define SWAP_WORDS_64 (swap_r_func_t)0
-#define SWAP_WORDS_32 (swap_r_func_t)1
-#define SWAP_BYTES (swap_r_func_t)2
-#define SWAP_WRAPPER (swap_r_func_t)3
-
-struct wrapper {
- cmp_func_t cmp;
- swap_func_t swap_func;
-};
-
-/*
- * The function pointer is last to make tail calls most efficient if the
- * compiler decides not to inline this function.
- */
-static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
-{
- if (swap_func == SWAP_WRAPPER) {
- ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
- return;
- }
-
- if (swap_func == SWAP_WORDS_64)
- swap_words_64(a, b, size);
- else if (swap_func == SWAP_WORDS_32)
- swap_words_32(a, b, size);
- else if (swap_func == SWAP_BYTES)
- swap_bytes(a, b, size);
- else
- swap_func(a, b, (int)size, priv);
-}
-
-#define _CMP_WRAPPER ((cmp_r_func_t)0L)
-
-static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
-{
- if (cmp == _CMP_WRAPPER)
- return ((const struct wrapper *)priv)->cmp(a, b);
- return cmp(a, b, priv);
-}
-
-static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size,
- cmp_r_func_t cmp_func, const void *priv,
- size_t l, size_t r)
-{
- return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size,
- base1 + inorder_to_eytzinger1(r, n) * size,
- cmp_func, priv);
-}
-
-static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size,
- swap_r_func_t swap_func, const void *priv,
- size_t l, size_t r)
-{
- do_swap(base1 + inorder_to_eytzinger1(l, n) * size,
- base1 + inorder_to_eytzinger1(r, n) * size,
- size, swap_func, priv);
-}
-
-static void eytzinger1_sort_r(void *base1, size_t n, size_t size,
- cmp_r_func_t cmp_func,
- swap_r_func_t swap_func,
- const void *priv)
-{
- unsigned i, j, k;
-
- /* called from 'sort' without swap function, let's pick the default */
- if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
- swap_func = NULL;
-
- if (!swap_func) {
- if (is_aligned(base1, size, 8))
- swap_func = SWAP_WORDS_64;
- else if (is_aligned(base1, size, 4))
- swap_func = SWAP_WORDS_32;
- else
- swap_func = SWAP_BYTES;
- }
-
- /* heapify */
- for (i = n / 2; i >= 1; --i) {
- /* Find the sift-down path all the way to the leaves. */
- for (j = i; k = j * 2, k < n;)
- j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
-
- /* Special case for the last leaf with no sibling. */
- if (j * 2 == n)
- j *= 2;
-
- /* Backtrack to the correct location. */
- while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0)
- j /= 2;
-
- /* Shift the element into its correct place. */
- for (k = j; j != i;) {
- j /= 2;
- eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
- }
- }
-
- /* sort */
- for (i = n; i > 1; --i) {
- eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i);
-
- /* Find the sift-down path all the way to the leaves. */
- for (j = 1; k = j * 2, k + 1 < i;)
- j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
-
- /* Special case for the last leaf with no sibling. */
- if (j * 2 + 1 == i)
- j *= 2;
-
- /* Backtrack to the correct location. */
- while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0)
- j /= 2;
-
- /* Shift the element into its correct place. */
- for (k = j; j > 1;) {
- j /= 2;
- eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
- }
- }
-}
-
-void eytzinger0_sort_r(void *base, size_t n, size_t size,
- cmp_r_func_t cmp_func,
- swap_r_func_t swap_func,
- const void *priv)
-{
- void *base1 = base - size;
-
- return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
- cmp_func_t cmp_func,
- swap_func_t swap_func)
-{
- struct wrapper w = {
- .cmp = cmp_func,
- .swap_func = swap_func,
- };
-
- return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
-}
-
-#if 0
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/ktime.h>
-
-static u64 cmp_count;
-
-static int mycmp(const void *a, const void *b)
-{
- u32 _a = *(u32 *)a;
- u32 _b = *(u32 *)b;
-
- cmp_count++;
- if (_a < _b)
- return -1;
- else if (_a > _b)
- return 1;
- else
- return 0;
-}
-
-static int test(void)
-{
- size_t N, i;
- ktime_t start, end;
- s64 delta;
- u32 *arr;
-
- for (N = 10000; N <= 100000; N += 10000) {
- arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL);
- cmp_count = 0;
-
- for (i = 0; i < N; i++)
- arr[i] = get_random_u32();
-
- start = ktime_get();
- eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL);
- end = ktime_get();
-
- delta = ktime_us_delta(end, start);
- printk(KERN_INFO "time: %lld\n", delta);
- printk(KERN_INFO "comparisons: %lld\n", cmp_count);
-
- u32 prev = 0;
-
- eytzinger0_for_each(i, N) {
- if (prev > arr[i])
- goto err;
- prev = arr[i];
- }
-
- kfree(arr);
- }
- return 0;
-
-err:
- kfree(arr);
- return -1;
-}
-#endif
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
deleted file mode 100644
index 643c1f716061..000000000000
--- a/fs/bcachefs/eytzinger.h
+++ /dev/null
@@ -1,300 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _EYTZINGER_H
-#define _EYTZINGER_H
-
-#include <linux/bitops.h>
-#include <linux/log2.h>
-
-#ifdef EYTZINGER_DEBUG
-#include <linux/bug.h>
-#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
-#else
-#define EYTZINGER_BUG_ON(cond)
-#endif
-
-/*
- * Traversal for trees in eytzinger layout - a full binary tree laid out in an
- * array.
- *
- * Consider using an eytzinger tree any time you would otherwise be doing binary
- * search over an array. Binary search is a worst case scenario for branch
- * prediction and prefetching, but in an eytzinger tree every node's children
- * are adjacent in memory, thus we can prefetch children before knowing the
- * result of the comparison, assuming multiple nodes fit on a cacheline.
- *
- * Two variants are provided, for one based indexing and zero based indexing.
- *
- * Zero based indexing is more convenient, but one based indexing has better
- * alignment and thus better performance because each new level of the tree
- * starts at a power of two, and thus if element 0 was cacheline aligned, each
- * new level will be as well.
- */
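
A hedged illustration (not part of the original header) of the one-based layout for a seven-element sorted array A < B < ... < G:

/*
 *   array index:  1  2  3  4  5  6  7
 *   element:      D  B  F  A  C  E  G
 *
 * Node i's children live at 2i and 2i+1, so the tree rooted at index 1 is
 * D over (B, F) over (A, C, E, G), and an in-order walk of the indices
 * (4, 2, 5, 1, 6, 3, 7) visits A..G in sorted order - the mapping that
 * eytzinger1_to_inorder()/inorder_to_eytzinger1() compute.
 */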
-
-static inline unsigned eytzinger1_child(unsigned i, unsigned child)
-{
- EYTZINGER_BUG_ON(child > 1);
-
- return (i << 1) + child;
-}
-
-static inline unsigned eytzinger1_left_child(unsigned i)
-{
- return eytzinger1_child(i, 0);
-}
-
-static inline unsigned eytzinger1_right_child(unsigned i)
-{
- return eytzinger1_child(i, 1);
-}
-
-static inline unsigned eytzinger1_first(unsigned size)
-{
- return size ? rounddown_pow_of_two(size) : 0;
-}
-
-static inline unsigned eytzinger1_last(unsigned size)
-{
- return rounddown_pow_of_two(size + 1) - 1;
-}
-
-static inline unsigned eytzinger1_next(unsigned i, unsigned size)
-{
- EYTZINGER_BUG_ON(i == 0 || i > size);
-
- if (eytzinger1_right_child(i) <= size) {
- i = eytzinger1_right_child(i);
-
- i <<= __fls(size) - __fls(i);
- i >>= i > size;
- } else {
- i >>= ffz(i) + 1;
- }
-
- return i;
-}
-
-static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
-{
- EYTZINGER_BUG_ON(i == 0 || i > size);
-
- if (eytzinger1_left_child(i) <= size) {
- i = eytzinger1_left_child(i) + 1;
-
- i <<= __fls(size) - __fls(i);
- i -= 1;
- i >>= i > size;
- } else {
- i >>= __ffs(i) + 1;
- }
-
- return i;
-}
-
-static inline unsigned eytzinger1_extra(unsigned size)
-{
- return size
- ? (size + 1 - rounddown_pow_of_two(size)) << 1
- : 0;
-}
-
-static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
- unsigned extra)
-{
- unsigned b = __fls(i);
- unsigned shift = __fls(size) - b;
- int s;
-
- EYTZINGER_BUG_ON(!i || i > size);
-
- i ^= 1U << b;
- i <<= 1;
- i |= 1;
- i <<= shift;
-
- /*
- * sign bit trick:
- *
- * if (i > extra)
- * i -= (i - extra) >> 1;
- */
- s = extra - i;
- i += (s >> 1) & (s >> 31);
-
- return i;
-}
-
-static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
- unsigned extra)
-{
- unsigned shift;
- int s;
-
- EYTZINGER_BUG_ON(!i || i > size);
-
- /*
- * sign bit trick:
- *
- * if (i > extra)
- * i += i - extra;
- */
- s = extra - i;
- i -= s & (s >> 31);
-
- shift = __ffs(i);
-
- i >>= shift + 1;
- i |= 1U << (__fls(size) - shift);
-
- return i;
-}
-
-static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
-{
- return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
-{
- return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
-}
-
-#define eytzinger1_for_each(_i, _size) \
- for (unsigned (_i) = eytzinger1_first((_size)); \
- (_i) != 0; \
- (_i) = eytzinger1_next((_i), (_size)))
-
-/* Zero based indexing version: */
-
-static inline unsigned eytzinger0_child(unsigned i, unsigned child)
-{
- EYTZINGER_BUG_ON(child > 1);
-
- return (i << 1) + 1 + child;
-}
-
-static inline unsigned eytzinger0_left_child(unsigned i)
-{
- return eytzinger0_child(i, 0);
-}
-
-static inline unsigned eytzinger0_right_child(unsigned i)
-{
- return eytzinger0_child(i, 1);
-}
-
-static inline unsigned eytzinger0_first(unsigned size)
-{
- return eytzinger1_first(size) - 1;
-}
-
-static inline unsigned eytzinger0_last(unsigned size)
-{
- return eytzinger1_last(size) - 1;
-}
-
-static inline unsigned eytzinger0_next(unsigned i, unsigned size)
-{
- return eytzinger1_next(i + 1, size) - 1;
-}
-
-static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
-{
- return eytzinger1_prev(i + 1, size) - 1;
-}
-
-static inline unsigned eytzinger0_extra(unsigned size)
-{
- return eytzinger1_extra(size);
-}
-
-static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
- unsigned extra)
-{
- return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
-}
-
-static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
- unsigned extra)
-{
- return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
-}
-
-static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
-{
- return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
-{
- return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
-}
-
-#define eytzinger0_for_each(_i, _size) \
- for (unsigned (_i) = eytzinger0_first((_size)); \
- (_i) != -1; \
- (_i) = eytzinger0_next((_i), (_size)))
-
-#define eytzinger0_for_each_prev(_i, _size) \
- for (unsigned (_i) = eytzinger0_last((_size)); \
- (_i) != -1; \
- (_i) = eytzinger0_prev((_i), (_size)))
-
-/* return greatest node <= @search, or -1 if not found */
-static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
-{
- void *base1 = base - size;
- unsigned n = 1;
-
- while (n <= nr)
- n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
- n >>= __ffs(n) + 1;
- return n - 1;
-}
-
-/* return smallest node > @search, or -1 if not found */
-static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
-{
- void *base1 = base - size;
- unsigned n = 1;
-
- while (n <= nr)
- n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
- n >>= __ffs(n + 1) + 1;
- return n - 1;
-}
-
-/* return smallest node >= @search, or -1 if not found */
-static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
-{
- void *base1 = base - size;
- unsigned n = 1;
-
- while (n <= nr)
- n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0);
- n >>= __ffs(n + 1) + 1;
- return n - 1;
-}
-
-#define eytzinger0_find(base, nr, size, _cmp, search) \
-({ \
- size_t _size = (size); \
- void *_base1 = (void *)(base) - _size; \
- const void *_search = (search); \
- size_t _nr = (nr); \
- size_t _i = 1; \
- int _res; \
- \
- while (_i <= _nr && \
- (_res = _cmp(_search, _base1 + _i * _size))) \
- _i = eytzinger1_child(_i, _res > 0); \
- _i - 1; \
-})
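
A hedged usage sketch (not part of the original header; function names are illustrative): a membership test over a u32 array stored in zero-based eytzinger order. eytzinger0_find() evaluates to an index >= nr when there is no exact match, so comparing against nr doubles as the found/not-found check.

static int example_cmp_u32(const void *_l, const void *_r)
{
	u32 l = *(const u32 *) _l, r = *(const u32 *) _r;

	return l < r ? -1 : l > r;
}

static bool example_contains(u32 *tree, size_t nr, u32 needle)
{
	size_t idx = eytzinger0_find(tree, nr, sizeof(u32), example_cmp_u32, &needle);

	return idx < nr;
}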
-
-void eytzinger0_sort_r(void *, size_t, size_t,
- cmp_r_func_t, swap_r_func_t, const void *);
-void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
-
-#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c
deleted file mode 100644
index 2faec143eb31..000000000000
--- a/fs/bcachefs/fast_list.c
+++ /dev/null
@@ -1,156 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Fast, unordered lists
- *
- * Supports add, remove, and iterate
- *
- * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot
- * allocation and freeing.
- *
- * This means that adding, removing, and iterating over items is lockless,
- * except when refilling/emptying the percpu slot buffers.
- */
-
-#include "fast_list.h"
-
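
A hedged usage sketch of the add/remove/iterate API declared in fast_list.h (not part of the original file; the function names are illustrative):

static int example_track(struct fast_list *l, void *obj)
{
	int idx = fast_list_add(l, obj);	/* slot index, or -ENOMEM */
	if (idx < 0)
		return idx;

	/* ... obj is now visible to concurrent iterators ... */

	fast_list_remove(l, idx);		/* the index must be passed back */
	return 0;
}

static void example_walk(struct fast_list *l)
{
	struct genradix_iter iter;
	void *i;

	fast_list_for_each(l, iter, i)
		pr_debug("item %p\n", i);
}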
-struct fast_list_pcpu {
- u32 nr;
- u32 entries[31];
-};
-
-static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp)
-{
- int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp);
- if (unlikely(idx < 0))
- return 0;
-
- if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) {
- ida_free(&l->slots_allocated, idx);
- return 0;
- }
-
- return idx;
-}
-
-/**
- * fast_list_get_idx - get a slot in a fast_list
- * @l: list to get slot in
- *
- * This allocates a slot in the radix tree without storing to it, so that the
- * potential memory allocation failure is taken early, and the actual list add
- * can be done later from a context that cannot handle allocation failure.
- *
- * Returns: positive integer on success, -ENOMEM on failure
- */
-int fast_list_get_idx(struct fast_list *l)
-{
- unsigned long flags;
- int idx;
-retry:
- local_irq_save(flags);
- struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
-
- if (unlikely(!lp->nr)) {
- u32 entries[16], nr = 0;
-
- local_irq_restore(flags);
- while (nr < ARRAY_SIZE(entries) &&
- (idx = fast_list_alloc_idx(l, GFP_KERNEL)))
- entries[nr++] = idx;
- local_irq_save(flags);
-
- lp = this_cpu_ptr(l->buffer);
-
- while (nr && lp->nr < ARRAY_SIZE(lp->entries))
- lp->entries[lp->nr++] = entries[--nr];
-
- if (unlikely(nr)) {
- local_irq_restore(flags);
- while (nr)
- ida_free(&l->slots_allocated, entries[--nr]);
- goto retry;
- }
-
- if (unlikely(!lp->nr)) {
- local_irq_restore(flags);
- return -ENOMEM;
- }
- }
-
- idx = lp->entries[--lp->nr];
- local_irq_restore(flags);
-
- return idx;
-}
-
-/**
- * fast_list_add - add an item to a fast_list
- * @l: list
- * @item: item to add
- *
- * Allocates a slot in the radix tree and stores to it and then returns the
- * slot index, which must be passed to fast_list_remove().
- *
- * Returns: positive integer on success, -ENOMEM on failure
- */
-int fast_list_add(struct fast_list *l, void *item)
-{
- int idx = fast_list_get_idx(l);
- if (idx < 0)
- return idx;
-
- *genradix_ptr_inlined(&l->items, idx) = item;
- return idx;
-}
-
-/**
- * fast_list_remove - remove an item from a fast_list
- * @l: list
- * @idx: item's slot index
- *
- * Zeroes out the slot in the radix tree and frees the slot for future
- * fast_list_add() operations.
- */
-void fast_list_remove(struct fast_list *l, unsigned idx)
-{
- u32 entries[16], nr = 0;
- unsigned long flags;
-
- if (!idx)
- return;
-
- *genradix_ptr_inlined(&l->items, idx) = NULL;
-
- local_irq_save(flags);
- struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
-
- if (unlikely(lp->nr == ARRAY_SIZE(lp->entries)))
- while (nr < ARRAY_SIZE(entries))
- entries[nr++] = lp->entries[--lp->nr];
-
- lp->entries[lp->nr++] = idx;
- local_irq_restore(flags);
-
- if (unlikely(nr))
- while (nr)
- ida_free(&l->slots_allocated, entries[--nr]);
-}
-
-void fast_list_exit(struct fast_list *l)
-{
- /* XXX: warn if list isn't empty */
- free_percpu(l->buffer);
- ida_destroy(&l->slots_allocated);
- genradix_free(&l->items);
-}
-
-int fast_list_init(struct fast_list *l)
-{
- genradix_init(&l->items);
- ida_init(&l->slots_allocated);
- l->buffer = alloc_percpu(*l->buffer);
- if (!l->buffer)
- return -ENOMEM;
- return 0;
-}
diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h
deleted file mode 100644
index 73c9bf591fd6..000000000000
--- a/fs/bcachefs/fast_list.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _LINUX_FAST_LIST_H
-#define _LINUX_FAST_LIST_H
-
-#include <linux/generic-radix-tree.h>
-#include <linux/idr.h>
-#include <linux/percpu.h>
-
-struct fast_list_pcpu;
-
-struct fast_list {
- GENRADIX(void *) items;
- struct ida slots_allocated;
- struct fast_list_pcpu __percpu
- *buffer;
-};
-
-static inline void *fast_list_iter_peek(struct genradix_iter *iter,
- struct fast_list *list)
-{
- void **p;
- while ((p = genradix_iter_peek(iter, &list->items)) && !*p)
- genradix_iter_advance(iter, &list->items);
-
- return p ? *p : NULL;
-}
-
-#define fast_list_for_each_from(_list, _iter, _i, _start) \
- for (_iter = genradix_iter_init(&(_list)->items, _start); \
- (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \
- genradix_iter_advance(&(_iter), &(_list)->items))
-
-#define fast_list_for_each(_list, _iter, _i) \
- fast_list_for_each_from(_list, _iter, _i, 0)
-
-int fast_list_get_idx(struct fast_list *l);
-int fast_list_add(struct fast_list *l, void *item);
-void fast_list_remove(struct fast_list *l, unsigned idx);
-void fast_list_exit(struct fast_list *l);
-int fast_list_init(struct fast_list *l);
-
-#endif /* _LINUX_FAST_LIST_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
deleted file mode 100644
index d8153fe27037..000000000000
--- a/fs/bcachefs/fifo.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FIFO_H
-#define _BCACHEFS_FIFO_H
-
-#include "util.h"
-
-#define FIFO(type) \
-struct { \
- size_t front, back, size, mask; \
- type *data; \
-}
-
-#define DECLARE_FIFO(type, name) FIFO(type) name
-
-#define fifo_buf_size(fifo) \
- ((fifo)->size \
- ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \
- : 0)
-
-#define init_fifo(fifo, _size, _gfp) \
-({ \
- (fifo)->front = (fifo)->back = 0; \
- (fifo)->size = (_size); \
- (fifo)->mask = (fifo)->size \
- ? roundup_pow_of_two((fifo)->size) - 1 \
- : 0; \
- (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \
-})
-
-#define free_fifo(fifo) \
-do { \
- kvfree((fifo)->data); \
- (fifo)->data = NULL; \
-} while (0)
-
-#define fifo_swap(l, r) \
-do { \
- swap((l)->front, (r)->front); \
- swap((l)->back, (r)->back); \
- swap((l)->size, (r)->size); \
- swap((l)->mask, (r)->mask); \
- swap((l)->data, (r)->data); \
-} while (0)
-
-#define fifo_move(dest, src) \
-do { \
- typeof(*((dest)->data)) _t; \
- while (!fifo_full(dest) && \
- fifo_pop(src, _t)) \
- fifo_push(dest, _t); \
-} while (0)
-
-#define fifo_used(fifo) (((fifo)->back - (fifo)->front))
-#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
-
-#define fifo_empty(fifo) ((fifo)->front == (fifo)->back)
-#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size)
-
-#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
-#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
-
-#define fifo_entry_idx_abs(fifo, p) \
- ((((p) >= &fifo_peek_front(fifo) \
- ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \
- (((p) - (fifo)->data)))
-
-#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
-#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask])
-
-#define fifo_push_back_ref(f) \
- (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
-
-#define fifo_push_front_ref(f) \
- (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
-
-#define fifo_push_back(fifo, new) \
-({ \
- typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \
- if (_r) \
- *_r = (new); \
- _r != NULL; \
-})
-
-#define fifo_push_front(fifo, new) \
-({ \
- typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \
- if (_r) \
- *_r = (new); \
- _r != NULL; \
-})
-
-#define fifo_pop_front(fifo, i) \
-({ \
- bool _r = !fifo_empty((fifo)); \
- if (_r) \
- (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \
- _r; \
-})
-
-#define fifo_pop_back(fifo, i) \
-({ \
- bool _r = !fifo_empty((fifo)); \
- if (_r) \
- (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
- _r; \
-})
-
-#define fifo_push_ref(fifo) fifo_push_back_ref(fifo)
-#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
-#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
-#define fifo_peek(fifo) fifo_peek_front(fifo)
-
-#define fifo_for_each_entry(_entry, _fifo, _iter) \
- for (typecheck(typeof((_fifo)->front), _iter), \
- (_iter) = (_fifo)->front; \
- ((_iter != (_fifo)->back) && \
- (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
- (_iter)++)
-
-#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
- for (typecheck(typeof((_fifo)->front), _iter), \
- (_iter) = (_fifo)->front; \
- ((_iter != (_fifo)->back) && \
- (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
- (_iter)++)
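
A hedged usage sketch of the FIFO macros above (not part of the original header; the function name and element type are illustrative):

static int example_fifo(void)
{
	FIFO(u64) fifo;
	size_t iter;
	u64 v;

	if (!init_fifo(&fifo, 8, GFP_KERNEL))
		return -ENOMEM;

	for (v = 0; !fifo_full(&fifo); v++)
		fifo_push(&fifo, v);			/* pushes 0..7 */

	fifo_for_each_entry(v, &fifo, iter)
		pr_debug("entry %llu\n", (unsigned long long) v);

	while (fifo_pop(&fifo, v))			/* drains front to back */
		;

	free_fifo(&fifo);
	return 0;
}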
-
-#endif /* _BCACHEFS_FIFO_H */
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
deleted file mode 100644
index 1c54b9b5bd69..000000000000
--- a/fs/bcachefs/fs-io-buffered.c
+++ /dev/null
@@ -1,1109 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/backing-dev.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h>
-
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
- if (bio->bi_vcnt >= bio->bi_max_vecs)
- return true;
- if (bio->bi_iter.bi_size > UINT_MAX - len)
- return true;
- return false;
-}
-
-/* readpage(s): */
-
-static void bch2_readpages_end_io(struct bio *bio)
-{
- struct folio_iter fi;
-
- bio_for_each_folio_all(fi, bio)
- folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);
-
- bio_put(bio);
-}
-
-struct readpages_iter {
- struct address_space *mapping;
- unsigned idx;
- folios folios;
-};
-
-static int readpages_iter_init(struct readpages_iter *iter,
- struct readahead_control *ractl)
-{
- struct folio *folio;
-
- *iter = (struct readpages_iter) { ractl->mapping };
-
- while ((folio = __readahead_folio(ractl))) {
- if (!bch2_folio_create(folio, GFP_KERNEL) ||
- darray_push(&iter->folios, folio)) {
- bch2_folio_release(folio);
- ractl->_nr_pages += folio_nr_pages(folio);
- ractl->_index -= folio_nr_pages(folio);
- return iter->folios.nr ? 0 : -ENOMEM;
- }
-
- folio_put(folio);
- }
-
- return 0;
-}
-
-static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
-{
- if (iter->idx >= iter->folios.nr)
- return NULL;
- return iter->folios.data[iter->idx];
-}
-
-static inline void readpage_iter_advance(struct readpages_iter *iter)
-{
- iter->idx++;
-}
-
-static bool extent_partial_reads_expensive(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
-
- bkey_for_each_crc(k.k, ptrs, crc, i)
- if (crc.csum_type || crc.compression_type)
- return true;
- return false;
-}
-
-static int readpage_bio_extend(struct btree_trans *trans,
- struct readpages_iter *iter,
- struct bio *bio,
- unsigned sectors_this_extent,
- bool get_more)
-{
- /* Don't hold btree locks while allocating memory: */
- bch2_trans_unlock(trans);
-
- while (bio_sectors(bio) < sectors_this_extent &&
- bio->bi_vcnt < bio->bi_max_vecs) {
- struct folio *folio = readpage_iter_peek(iter);
- int ret;
-
- if (folio) {
- readpage_iter_advance(iter);
- } else {
- pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
-
- if (!get_more)
- break;
-
- unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio);
-
- if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping))
- break;
-
- unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
-
- /* ensure proper alignment */
- order = min(order, __ffs(folio_offset|BIT(31)));
-
- folio = xa_load(&iter->mapping->i_pages, folio_offset);
- if (folio && !xa_is_value(folio))
- break;
-
- folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order);
- if (!folio)
- break;
-
- if (!__bch2_folio_create(folio, GFP_KERNEL)) {
- folio_put(folio);
- break;
- }
-
- ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
- if (ret) {
- __bch2_folio_release(folio);
- folio_put(folio);
- break;
- }
-
- folio_put(folio);
- }
-
- BUG_ON(folio_sector(folio) != bio_end_sector(bio));
-
- BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
- }
-
- return bch2_trans_relock(trans);
-}
-
-static void bchfs_read(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- subvol_inum inum,
- struct readpages_iter *readpages_iter)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_buf sk;
- int flags = BCH_READ_retry_if_stale|
- BCH_READ_may_promote;
- int ret = 0;
-
- rbio->subvol = inum.subvol;
-
- bch2_bkey_buf_init(&sk);
- bch2_trans_begin(trans);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, rbio->bio.bi_iter.bi_sector),
- BTREE_ITER_slots);
- while (1) {
- struct bkey_s_c k;
- unsigned bytes, sectors;
- s64 offset_into_extent;
- enum btree_id data_btree = BTREE_ID_extents;
-
- bch2_trans_begin(trans);
-
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
- bch2_btree_iter_set_pos(trans, &iter,
- POS(inum.inum, rbio->bio.bi_iter.bi_sector));
-
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- offset_into_extent = iter.pos.offset -
- bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
-
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- ret = bch2_read_indirect_extent(trans, &data_btree,
- &offset_into_extent, &sk);
- if (ret)
- goto err;
-
- k = bkey_i_to_s_c(sk.k);
-
- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
- if (readpages_iter) {
- ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
- extent_partial_reads_expensive(k));
- if (ret)
- goto err;
- }
-
- bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
- swap(rbio->bio.bi_iter.bi_size, bytes);
-
- if (rbio->bio.bi_iter.bi_size == bytes)
- flags |= BCH_READ_last_fragment;
-
- bch2_bio_page_state_set(&rbio->bio, k);
-
- bch2_read_extent(trans, rbio, iter.pos,
- data_btree, k, offset_into_extent, flags);
- /*
- * Careful there's a landmine here if bch2_read_extent() ever
- * starts returning transaction restarts here.
- *
- * We've changed rbio->bi_iter.bi_size to be "bytes we can read
- * from this extent" with the swap call, and we restore it
- * below. That restore needs to come before checking for
- * errors.
- *
- * But unlike __bch2_read(), we use the rbio bvec iter, not one
- * on the stack, so we can't do the restore right after the
- * bch2_read_extent() call: we don't own that iterator anymore
- * if BCH_READ_last_fragment is set, since we may have submitted
- * that rbio instead of cloning it.
- */
-
- if (flags & BCH_READ_last_fragment)
- break;
-
- swap(rbio->bio.bi_iter.bi_size, bytes);
- bio_advance(&rbio->bio, bytes);
-err:
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
- prt_printf(&buf, "read error %i from btree lookup", ret);
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- rbio->bio.bi_status = BLK_STS_IOERR;
- bio_endio(&rbio->bio);
- }
-
- bch2_bkey_buf_exit(&sk, c);
-}
-
-void bch2_readahead(struct readahead_control *ractl)
-{
- struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts;
- struct folio *folio;
- struct readpages_iter readpages_iter;
- struct blk_plug plug;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- int ret = readpages_iter_init(&readpages_iter, ractl);
- if (ret)
- return;
-
- /*
- * Besides being a general performance optimization, plugging helps with
- * avoiding btree transaction srcu warnings - submitting a bio can
- * block, and we don't want to do that with the transaction locked.
- *
- * However, plugged bios are submitted when we schedule; we ideally
- * would have our own scheduler hook to call unlock_long() before
- * scheduling.
- */
- blk_start_plug(&plug);
- bch2_pagecache_add_get(inode);
-
- struct btree_trans *trans = bch2_trans_get(c);
- while ((folio = readpage_iter_peek(&readpages_iter))) {
- unsigned n = min_t(unsigned,
- readpages_iter.folios.nr -
- readpages_iter.idx,
- BIO_MAX_VECS);
- struct bch_read_bio *rbio =
- rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
- GFP_KERNEL, &c->bio_read),
- c,
- opts,
- bch2_readpages_end_io);
-
- readpage_iter_advance(&readpages_iter);
-
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- bchfs_read(trans, rbio, inode_inum(inode),
- &readpages_iter);
- bch2_trans_unlock(trans);
- }
- bch2_trans_put(trans);
-
- bch2_pagecache_add_put(inode);
- blk_finish_plug(&plug);
- darray_exit(&readpages_iter.folios);
-}
-
-static void bch2_read_single_folio_end_io(struct bio *bio)
-{
- complete(bio->bi_private);
-}
-
-int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_read_bio *rbio;
- struct bch_io_opts opts;
- struct blk_plug plug;
- int ret;
- DECLARE_COMPLETION_ONSTACK(done);
-
- BUG_ON(folio_test_uptodate(folio));
- BUG_ON(folio_test_dirty(folio));
-
- if (!bch2_folio_create(folio, GFP_KERNEL))
- return -ENOMEM;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
- c,
- opts,
- bch2_read_single_folio_end_io);
- rbio->bio.bi_private = &done;
- rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- blk_start_plug(&plug);
- bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
- blk_finish_plug(&plug);
- wait_for_completion(&done);
-
- ret = blk_status_to_errno(rbio->bio.bi_status);
- bio_put(&rbio->bio);
-
- if (ret < 0)
- return ret;
-
- folio_mark_uptodate(folio);
- return 0;
-}
-
-int bch2_read_folio(struct file *file, struct folio *folio)
-{
- int ret;
-
- ret = bch2_read_single_folio(folio, folio->mapping);
- folio_unlock(folio);
- return bch2_err_class(ret);
-}
-
-/* writepages: */
-
-struct bch_writepage_io {
- struct bch_inode_info *inode;
-
- /* must be last: */
- struct bch_write_op op;
-};
-
-struct bch_writepage_state {
- struct bch_writepage_io *io;
- struct bch_io_opts opts;
- struct bch_folio_sector *tmp;
- unsigned tmp_sectors;
- struct blk_plug plug;
-};
-
-/*
- * Determine when a writepage io is full. We have to limit writepage bios to a
- * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
- * what the bounce path in bch2_write_extent() can handle. In theory we could
- * loosen this restriction for non-bounce I/O, but we don't have that context
- * here. Ideally, we could raise this limit and make it configurable in the
- * future, once the bounce path can be enhanced to accommodate larger source bios.
- */
-static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
-{
- struct bio *bio = &io->op.wbio.bio;
- return bio_full(bio, len) ||
- (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
-}
-
-static void bch2_writepage_io_done(struct bch_write_op *op)
-{
- struct bch_writepage_io *io =
- container_of(op, struct bch_writepage_io, op);
- struct bch_fs *c = io->op.c;
- struct bio *bio = &io->op.wbio.bio;
- struct folio_iter fi;
- unsigned i;
-
- if (io->op.error) {
- set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
-
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s;
-
- mapping_set_error(fi.folio->mapping, -EIO);
-
- s = __bch2_folio(fi.folio);
- spin_lock(&s->lock);
- for (i = 0; i < folio_sectors(fi.folio); i++)
- s->s[i].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
- }
-
- if (io->op.flags & BCH_WRITE_wrote_data_inline) {
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s;
-
- s = __bch2_folio(fi.folio);
- spin_lock(&s->lock);
- for (i = 0; i < folio_sectors(fi.folio); i++)
- s->s[i].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
- }
-
- /*
- * racing with fallocate can cause us to add fewer sectors than
- * expected - but we shouldn't add more sectors than expected:
- */
- WARN_ON_ONCE(io->op.i_sectors_delta > 0);
-
- /*
- * (error (due to going RO) halfway through a page can screw that up
- * slightly)
- * XXX wtf?
- BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
- */
-
- /*
- * The writeback flag is effectively our ref on the inode -
- * fixup i_blocks before calling folio_end_writeback:
- */
- bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
-
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s = __bch2_folio(fi.folio);
-
- if (atomic_dec_and_test(&s->write_count))
- folio_end_writeback(fi.folio);
- }
-
- bio_put(&io->op.wbio.bio);
-}
-
-static void bch2_writepage_do_io(struct bch_writepage_state *w)
-{
- struct bch_writepage_io *io = w->io;
-
- w->io = NULL;
- closure_call(&io->op.cl, bch2_write, NULL, NULL);
-}
-
-/*
- * Get a bch_writepage_io and add @page to it - appending to an existing one if
- * possible, else allocating a new one:
- */
-static void bch2_writepage_io_alloc(struct bch_fs *c,
- struct writeback_control *wbc,
- struct bch_writepage_state *w,
- struct bch_inode_info *inode,
- u64 sector,
- unsigned nr_replicas)
-{
- struct bch_write_op *op;
-
- w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
- REQ_OP_WRITE,
- GFP_KERNEL,
- &c->writepage_bioset),
- struct bch_writepage_io, op.wbio.bio);
-
- w->io->inode = inode;
- op = &w->io->op;
- bch2_write_op_init(op, c, w->opts);
- op->target = w->opts.foreground_target;
- op->nr_replicas = nr_replicas;
- op->res.nr_replicas = nr_replicas;
- op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->subvol = inode->ei_inum.subvol;
- op->pos = POS(inode->v.i_ino, sector);
- op->end_io = bch2_writepage_io_done;
- op->devs_need_flush = &inode->ei_devs_need_flush;
- op->wbio.bio.bi_iter.bi_sector = sector;
- op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
-}
-
-static int __bch2_writepage(struct folio *folio,
- struct writeback_control *wbc,
- void *data)
-{
- struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_writepage_state *w = data;
- struct bch_folio *s;
- unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
- loff_t i_size = i_size_read(&inode->v);
- int ret;
-
- EBUG_ON(!folio_test_uptodate(folio));
-
- /* Is the folio fully inside i_size? */
- if (folio_end_pos(folio) <= i_size)
- goto do_io;
-
- /* Is the folio fully outside i_size? (truncate in progress) */
- if (folio_pos(folio) >= i_size) {
- folio_unlock(folio);
- return 0;
- }
-
- /*
- * The folio straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
- * in multiples of the folio size. For a file that is not a multiple of
- * the folio size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
- */
- folio_zero_segment(folio,
- i_size - folio_pos(folio),
- folio_size(folio));
-do_io:
- f_sectors = folio_sectors(folio);
- s = bch2_folio(folio);
-
- if (f_sectors > w->tmp_sectors) {
- kfree(w->tmp);
- w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
- w->tmp_sectors = f_sectors;
- }
-
- /*
- * Things get really hairy with errors during writeback:
- */
- ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
- BUG_ON(ret);
-
- /* Before unlocking the page, get copy of reservations: */
- spin_lock(&s->lock);
- memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
-
- for (i = 0; i < f_sectors; i++) {
- if (s->s[i].state < SECTOR_dirty)
- continue;
-
- nr_replicas_this_write =
- min_t(unsigned, nr_replicas_this_write,
- s->s[i].nr_replicas +
- s->s[i].replicas_reserved);
- }
-
- for (i = 0; i < f_sectors; i++) {
- if (s->s[i].state < SECTOR_dirty)
- continue;
-
- s->s[i].nr_replicas = w->opts.compression
- ? 0 : nr_replicas_this_write;
-
- s->s[i].replicas_reserved = 0;
- bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
- }
- spin_unlock(&s->lock);
-
- BUG_ON(atomic_read(&s->write_count));
- atomic_set(&s->write_count, 1);
-
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio);
-
- folio_unlock(folio);
-
- offset = 0;
- while (1) {
- unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
- u64 sector;
-
- while (offset < f_sectors &&
- w->tmp[offset].state < SECTOR_dirty)
- offset++;
-
- if (offset == f_sectors)
- break;
-
- while (offset + sectors < f_sectors &&
- w->tmp[offset + sectors].state >= SECTOR_dirty) {
- reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
- dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
- sectors++;
- }
- BUG_ON(!sectors);
-
- sector = folio_sector(folio) + offset;
-
- if (w->io &&
- (w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bch_io_full(w->io, sectors << 9) ||
- bio_end_sector(&w->io->op.wbio.bio) != sector))
- bch2_writepage_do_io(w);
-
- if (!w->io)
- bch2_writepage_io_alloc(c, wbc, w, inode, sector,
- nr_replicas_this_write);
-
- atomic_inc(&s->write_count);
-
- BUG_ON(inode != w->io->inode);
- BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
- sectors << 9, offset << 9));
-
- w->io->op.res.sectors += reserved_sectors;
- w->io->op.i_sectors_delta -= dirty_sectors;
- w->io->op.new_i_size = i_size;
-
- offset += sectors;
- }
-
- if (atomic_dec_and_test(&s->write_count))
- folio_end_writeback(folio);
-
- return 0;
-}
-
-int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
- struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL);
-
- bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
-
- blk_start_plug(&w->plug);
- int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w);
- if (w->io)
- bch2_writepage_do_io(w);
- blk_finish_plug(&w->plug);
- kfree(w->tmp);
- kfree(w);
- return bch2_err_class(ret);
-}
-
-/* buffered writes: */
-
-int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation *res;
- struct folio *folio;
- unsigned offset;
- int ret = -ENOMEM;
-
- res = kmalloc(sizeof(*res), GFP_KERNEL);
- if (!res)
- return -ENOMEM;
-
- bch2_folio_reservation_init(c, inode, res);
- *fsdata = res;
-
- bch2_pagecache_add_get(inode);
-
- folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
- FGP_WRITEBEGIN | fgf_set_order(len),
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- goto err_unlock;
-
- offset = pos - folio_pos(folio);
- len = min_t(size_t, len, folio_end_pos(folio) - pos);
-
- if (folio_test_uptodate(folio))
- goto out;
-
- /* If we're writing entire folio, don't need to read it in first: */
- if (!offset && len == folio_size(folio))
- goto out;
-
- if (!offset && pos + len >= inode->v.i_size) {
- folio_zero_segment(folio, len, folio_size(folio));
- flush_dcache_folio(folio);
- goto out;
- }
-
- if (folio_pos(folio) >= inode->v.i_size) {
- folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
- flush_dcache_folio(folio);
- goto out;
- }
-readpage:
- ret = bch2_read_single_folio(folio, mapping);
- if (ret)
- goto err;
-out:
- ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
- if (ret)
- goto err;
-
- ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
- if (ret) {
- if (!folio_test_uptodate(folio)) {
- /*
- * If the folio hasn't been read in, we won't know if we
- * actually need a reservation - we don't actually need
- * to read here, we just need to check if the folio is
- * fully backed by uncompressed data:
- */
- goto readpage;
- }
-
- goto err;
- }
-
- *foliop = folio;
- return 0;
-err:
- folio_unlock(folio);
- folio_put(folio);
-err_unlock:
- bch2_pagecache_add_put(inode);
- kfree(res);
- *fsdata = NULL;
- return bch2_err_class(ret);
-}
-
-int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation *res = fsdata;
- unsigned offset = pos - folio_pos(folio);
-
- lockdep_assert_held(&inode->v.i_rwsem);
- BUG_ON(offset + copied > folio_size(folio));
-
- if (unlikely(copied < len && !folio_test_uptodate(folio))) {
- /*
- * The folio needs to be read in, but that would destroy
- * our partial write - simplest thing is to just force
- * userspace to redo the write:
- */
- folio_zero_range(folio, 0, folio_size(folio));
- flush_dcache_folio(folio);
- copied = 0;
- }
-
- spin_lock(&inode->v.i_lock);
- if (pos + copied > inode->v.i_size)
- i_size_write(&inode->v, pos + copied);
- spin_unlock(&inode->v.i_lock);
-
- if (copied) {
- if (!folio_test_uptodate(folio))
- folio_mark_uptodate(folio);
-
- bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
-
- inode->ei_last_dirtied = (unsigned long) current;
- }
-
- folio_unlock(folio);
- folio_put(folio);
- bch2_pagecache_add_put(inode);
-
- bch2_folio_reservation_put(c, inode, res);
- kfree(res);
-
- return copied;
-}
-
-static noinline void folios_trunc(folios *fs, struct folio **fi)
-{
- while (fs->data + fs->nr > fi) {
- struct folio *f = darray_pop(fs);
-
- folio_unlock(f);
- folio_put(f);
- }
-}
-
-static int __bch2_buffered_write(struct bch_inode_info *inode,
- struct address_space *mapping,
- struct iov_iter *iter,
- loff_t pos, unsigned len)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation res;
- folios fs;
- struct folio *f;
- unsigned copied = 0, f_offset, f_copied;
- u64 end = pos + len, f_pos, f_len;
- loff_t last_folio_pos = inode->v.i_size;
- int ret = 0;
-
- BUG_ON(!len);
-
- bch2_folio_reservation_init(c, inode, &res);
- darray_init(&fs);
-
- ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
- FGP_WRITEBEGIN | fgf_set_order(len),
- mapping_gfp_mask(mapping), &fs);
- if (ret)
- goto out;
-
- BUG_ON(!fs.nr);
-
- f = darray_first(fs);
- if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
-
- f = darray_last(fs);
- end = min(end, folio_end_pos(f));
- last_folio_pos = folio_pos(f);
- if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
- if (end >= inode->v.i_size) {
- folio_zero_range(f, 0, folio_size(f));
- } else {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
- }
-
- ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
- if (ret)
- goto out;
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(fs));
- darray_for_each(fs, fi) {
- ssize_t f_reserved;
-
- f = *fi;
- f_len = min(end, folio_end_pos(f)) - f_pos;
- f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);
-
- if (unlikely(f_reserved != f_len)) {
- if (f_reserved < 0) {
- if (f == darray_first(fs)) {
- ret = f_reserved;
- goto out;
- }
-
- folios_trunc(&fs, fi);
- end = min(end, folio_end_pos(darray_last(fs)));
- } else {
- if (!folio_test_uptodate(f)) {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
-
- folios_trunc(&fs, fi + 1);
- end = f_pos + f_reserved;
- }
-
- break;
- }
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- if (mapping_writably_mapped(mapping))
- darray_for_each(fs, fi)
- flush_dcache_folio(*fi);
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(fs));
- darray_for_each(fs, fi) {
- f = *fi;
- f_len = min(end, folio_end_pos(f)) - f_pos;
- f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
- if (!f_copied) {
- folios_trunc(&fs, fi);
- break;
- }
-
- if (!folio_test_uptodate(f) &&
- f_copied != folio_size(f) &&
- pos + copied + f_copied < inode->v.i_size) {
- iov_iter_revert(iter, f_copied);
- folio_zero_range(f, 0, folio_size(f));
- folios_trunc(&fs, fi);
- break;
- }
-
- flush_dcache_folio(f);
- copied += f_copied;
-
- if (f_copied != f_len) {
- folios_trunc(&fs, fi + 1);
- break;
- }
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- if (!copied)
- goto out;
-
- end = pos + copied;
-
- spin_lock(&inode->v.i_lock);
- if (end > inode->v.i_size)
- i_size_write(&inode->v, end);
- spin_unlock(&inode->v.i_lock);
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(fs));
- darray_for_each(fs, fi) {
- f = *fi;
- f_len = min(end, folio_end_pos(f)) - f_pos;
-
- if (!folio_test_uptodate(f))
- folio_mark_uptodate(f);
-
- bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- inode->ei_last_dirtied = (unsigned long) current;
-out:
- darray_for_each(fs, fi) {
- folio_unlock(*fi);
- folio_put(*fi);
- }
-
- /*
- * If the last folio added to the mapping starts beyond current EOF, we
- * performed a short write but left around at least one post-EOF folio.
- * Clean up the mapping before we return.
- */
- if (last_folio_pos >= inode->v.i_size)
- truncate_pagecache(&inode->v, inode->v.i_size);
-
- darray_exit(&fs);
- bch2_folio_reservation_put(c, inode, &res);
-
- return copied ?: ret;
-}
-
-static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(file);
- loff_t pos = iocb->ki_pos;
- ssize_t written = 0;
- int ret = 0;
-
- bch2_pagecache_add_get(inode);
-
- do {
- unsigned offset = pos & (PAGE_SIZE - 1);
- unsigned bytes = iov_iter_count(iter);
-again:
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
- */
- if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
- bytes = min_t(unsigned long, iov_iter_count(iter),
- PAGE_SIZE - offset);
-
- if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
- ret = -EFAULT;
- break;
- }
- }
-
- if (unlikely(fatal_signal_pending(current))) {
- ret = -EINTR;
- break;
- }
-
- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
- if (unlikely(ret < 0))
- break;
-
- cond_resched();
-
- if (unlikely(ret == 0)) {
- /*
- * If we were unable to copy any data at all, we must
- * fall back to a single segment length write.
- *
-			 * If we didn't fall back here, we could livelock
- * because not all segments in the iov can be copied at
- * once without a pagefault.
- */
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_single_seg_count(iter));
- goto again;
- }
- pos += ret;
- written += ret;
- ret = 0;
-
- balance_dirty_pages_ratelimited(mapping);
- } while (iov_iter_count(iter));
-
- bch2_pagecache_add_put(inode);
-
- return written ? written : ret;
-}
-
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- ssize_t ret;
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = bch2_direct_write(iocb, from);
- goto out;
- }
-
- inode_lock(&inode->v);
-
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- goto unlock;
-
- ret = file_remove_privs(file);
- if (ret)
- goto unlock;
-
- ret = file_update_time(file);
- if (ret)
- goto unlock;
-
- ret = bch2_buffered_write(iocb, from);
- if (likely(ret > 0))
- iocb->ki_pos += ret;
-unlock:
- inode_unlock(&inode->v);
-
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
-out:
- return bch2_err_class(ret);
-}
-
-void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
-{
- bioset_exit(&c->writepage_bioset);
-}
-
-int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
-{
- if (bioset_init(&c->writepage_bioset,
- 4, offsetof(struct bch_writepage_io, op.wbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_writepage_bioset_init;
-
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
deleted file mode 100644
index 14de91c27656..000000000000
--- a/fs/bcachefs/fs-io-buffered.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_BUFFERED_H
-#define _BCACHEFS_FS_IO_BUFFERED_H
-
-#ifndef NO_BCACHEFS_FS
-
-int bch2_read_single_folio(struct folio *, struct address_space *);
-int bch2_read_folio(struct file *, struct folio *);
-
-int bch2_writepages(struct address_space *, struct writeback_control *);
-void bch2_readahead(struct readahead_control *);
-
-int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos,
- unsigned len, struct folio **, void **);
-int bch2_write_end(const struct kiocb *, struct address_space *, loff_t,
- unsigned len, unsigned copied, struct folio *, void *);
-
-ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
-
-void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
-int bch2_fs_fs_io_buffered_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
deleted file mode 100644
index 1f5154d9676b..000000000000
--- a/fs/bcachefs/fs-io-direct.c
+++ /dev/null
@@ -1,704 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "enumerated_ref.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "io_read.h"
-#include "io_write.h"
-
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/prefetch.h>
-#include <linux/task_io_accounting_ops.h>
-
-/* O_DIRECT reads */
-
-struct dio_read {
- struct closure cl;
- struct kiocb *req;
- long ret;
- bool should_dirty;
- struct bch_read_bio rbio;
-};
-
-static void bio_check_or_release(struct bio *bio, bool check_dirty)
-{
- if (check_dirty) {
- bio_check_pages_dirty(bio);
- } else {
- bio_release_pages(bio, false);
- bio_put(bio);
- }
-}
-
-static CLOSURE_CALLBACK(bch2_dio_read_complete)
-{
- closure_type(dio, struct dio_read, cl);
-
- dio->req->ki_complete(dio->req, dio->ret);
- bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-}
-
-static void bch2_direct_IO_read_endio(struct bio *bio)
-{
- struct dio_read *dio = bio->bi_private;
-
- if (bio->bi_status)
- dio->ret = blk_status_to_errno(bio->bi_status);
-
- closure_put(&dio->cl);
-}
-
-static void bch2_direct_IO_read_split_endio(struct bio *bio)
-{
- struct dio_read *dio = bio->bi_private;
- bool should_dirty = dio->should_dirty;
-
- bch2_direct_IO_read_endio(bio);
- bio_check_or_release(bio, should_dirty);
-}
-
-static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
-{
- struct file *file = req->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts;
- struct dio_read *dio;
- struct bio *bio;
- struct blk_plug plug;
- loff_t offset = req->ki_pos;
- bool sync = is_sync_kiocb(req);
- bool split = false;
- size_t shorten;
- ssize_t ret;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- /* bios must be 512 byte aligned: */
- if ((offset|iter->count) & (SECTOR_SIZE - 1))
- return -EINVAL;
-
- ret = min_t(loff_t, iter->count,
- max_t(loff_t, 0, i_size_read(&inode->v) - offset));
-
- if (!ret)
- return ret;
-
- shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
- if (shorten >= iter->count)
- shorten = 0;
- iter->count -= shorten;
-
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_READ,
- GFP_KERNEL,
- &c->dio_read_bioset);
-
- dio = container_of(bio, struct dio_read, rbio.bio);
- closure_init(&dio->cl, NULL);
-
- /*
- * this is a _really_ horrible hack just to avoid an atomic sub at the
- * end:
- */
- if (!sync) {
- set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER -
- CLOSURE_RUNNING +
- CLOSURE_DESTRUCTOR);
- } else {
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER + 1);
- dio->cl.closure_get_happened = true;
- }
-
- dio->req = req;
- dio->ret = ret;
- /*
- * This is one of the sketchier things I've encountered: we have to skip
- * the dirtying of requests that are internal from the kernel (i.e. from
- * loopback), because we'll deadlock on page_lock.
- */
- dio->should_dirty = iter_is_iovec(iter);
-
- blk_start_plug(&plug);
-
- goto start;
- while (iter->count) {
- split = true;
-
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_READ,
- GFP_KERNEL,
- &c->bio_read);
-start:
- bio->bi_opf = REQ_OP_READ|REQ_SYNC;
- bio->bi_iter.bi_sector = offset >> 9;
- bio->bi_private = dio;
-
- ret = bio_iov_iter_get_pages(bio, iter);
- if (ret < 0) {
- /* XXX: fault inject this path */
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- break;
- }
-
- offset += bio->bi_iter.bi_size;
-
- if (dio->should_dirty)
- bio_set_pages_dirty(bio);
-
- if (iter->count)
- closure_get(&dio->cl);
-
- struct bch_read_bio *rbio =
- rbio_init(bio,
- c,
- opts,
- split
- ? bch2_direct_IO_read_split_endio
- : bch2_direct_IO_read_endio);
-
- bch2_read(c, rbio, inode_inum(inode));
- }
-
- blk_finish_plug(&plug);
-
- iter->count += shorten;
-
- if (sync) {
- closure_sync(&dio->cl);
- closure_debug_destroy(&dio->cl);
- ret = dio->ret;
- bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
- return ret;
- } else {
- return -EIOCBQUEUED;
- }
-}
-
-ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = file->f_mapping;
- size_t count = iov_iter_count(iter);
- ssize_t ret = 0;
-
- if (!count)
- return 0; /* skip atime */
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- struct blk_plug plug;
-
- if (unlikely(mapping->nrpages)) {
- ret = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (ret < 0)
- goto out;
- }
-
- file_accessed(file);
-
- blk_start_plug(&plug);
- ret = bch2_direct_IO_read(iocb, iter);
- blk_finish_plug(&plug);
-
- if (ret >= 0)
- iocb->ki_pos += ret;
- } else {
- bch2_pagecache_add_get(inode);
- ret = filemap_read(iocb, iter, ret);
- bch2_pagecache_add_put(inode);
- }
-out:
- return bch2_err_class(ret);
-}
-
-/* O_DIRECT writes */
-
-struct dio_write {
- struct kiocb *req;
- struct address_space *mapping;
- struct bch_inode_info *inode;
- struct mm_struct *mm;
- const struct iovec *iov;
- unsigned loop:1,
- extending:1,
- sync:1,
- flush:1;
- struct quota_res quota_res;
- u64 written;
-
- struct iov_iter iter;
- struct iovec inline_vecs[2];
-
- /* must be last: */
- struct bch_write_op op;
-};
-
-static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
- u64 offset, u64 size,
- unsigned nr_replicas, bool compressed)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 end = offset + size;
- u32 snapshot;
- bool ret = true;
- int err;
-retry:
- bch2_trans_begin(trans);
-
- err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (err)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_slots, k, err) {
- if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
- break;
-
- if (k.k->p.snapshot != snapshot ||
- nr_replicas > bch2_bkey_replicas(c, k) ||
- (!compressed && bch2_bkey_sectors_compressed(k))) {
- ret = false;
- break;
- }
- }
-
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(err, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_put(trans);
-
- return err ? false : ret;
-}
-
-static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct bch_inode_info *inode = dio->inode;
- struct bio *bio = &dio->op.wbio.bio;
-
- return bch2_check_range_allocated(c, inode_inum(inode),
- dio->op.pos.offset, bio_sectors(bio),
- dio->op.opts.data_replicas,
- dio->op.opts.compression != 0);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *);
-static __always_inline long bch2_dio_write_done(struct dio_write *dio);
-
-/*
- * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
- * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
- * caller's stack, and we're not guaranteed it will live for the duration of
- * the IO:
- */
-static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
-{
- struct iovec *iov = dio->inline_vecs;
-
- /*
- * iov_iter has a single embedded iovec - nothing to do:
- */
- if (iter_is_ubuf(&dio->iter))
- return 0;
-
- /*
- * We don't currently handle non-iovec iov_iters here - return an error,
- * and we'll fall back to doing the IO synchronously:
- */
- if (!iter_is_iovec(&dio->iter))
- return -1;
-
- if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
- dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
- GFP_KERNEL);
- if (unlikely(!iov))
- return -ENOMEM;
- }
-
- memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
- dio->iter.__iov = iov;
- return 0;
-}
-
-static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
-{
- closure_type(dio, struct dio_write, op.cl);
- struct bch_fs *c = dio->op.c;
-
- closure_debug_destroy(cl);
-
- dio->op.error = bch2_journal_error(&c->journal);
-
- bch2_dio_write_done(dio);
-}
-
-static noinline void bch2_dio_write_flush(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct bch_inode_unpacked inode;
- int ret;
-
- dio->flush = 0;
-
- closure_init(&dio->op.cl, NULL);
-
- if (!dio->op.error) {
- ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
- if (ret) {
- dio->op.error = ret;
- } else {
- bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
- &dio->op.cl);
- bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
- }
- }
-
- if (dio->sync) {
- closure_sync(&dio->op.cl);
- closure_debug_destroy(&dio->op.cl);
- } else {
- continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
- }
-}
-
-static __always_inline long bch2_dio_write_done(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct bch_inode_info *inode = dio->inode;
- bool sync = dio->sync;
- long ret;
-
- if (unlikely(dio->flush)) {
- bch2_dio_write_flush(dio);
- if (!sync)
- return -EIOCBQUEUED;
- }
-
- bch2_pagecache_block_put(inode);
-
- kfree(dio->iov);
-
- ret = dio->op.error ?: ((long) dio->written << 9);
- bio_put(&dio->op.wbio.bio);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
-
- /* inode->i_dio_count is our ref on inode and thus bch_fs */
- inode_dio_end(&inode->v);
-
- if (ret < 0)
- ret = bch2_err_class(ret);
-
- if (!sync) {
- req->ki_complete(req, ret);
- ret = -EIOCBQUEUED;
- }
- return ret;
-}
-
-static __always_inline void bch2_dio_write_end(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct bch_inode_info *inode = dio->inode;
- struct bio *bio = &dio->op.wbio.bio;
-
- req->ki_pos += (u64) dio->op.written << 9;
- dio->written += dio->op.written;
-
- if (dio->extending) {
- spin_lock(&inode->v.i_lock);
- if (req->ki_pos > inode->v.i_size)
- i_size_write(&inode->v, req->ki_pos);
- spin_unlock(&inode->v.i_lock);
- }
-
- if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
- __bch2_quota_reservation_put(c, inode, &dio->quota_res);
- mutex_unlock(&inode->ei_quota_lock);
- }
-
- bio_release_pages(bio, false);
-
- if (unlikely(dio->op.error))
- set_bit(EI_INODE_ERROR, &inode->ei_flags);
-}
-
-static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct address_space *mapping = dio->mapping;
- struct bch_inode_info *inode = dio->inode;
- struct bch_io_opts opts;
- struct bio *bio = &dio->op.wbio.bio;
- unsigned unaligned, iter_count;
- bool sync = dio->sync, dropped_locks;
- long ret;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- while (1) {
- iter_count = dio->iter.count;
-
- EBUG_ON(current->faults_disabled_mapping);
- current->faults_disabled_mapping = mapping;
-
- ret = bio_iov_iter_get_pages(bio, &dio->iter);
-
- dropped_locks = fdm_dropped_locks();
-
- current->faults_disabled_mapping = NULL;
-
- /*
- * If the fault handler returned an error but also signalled
- * that it dropped & retook ei_pagecache_lock, we just need to
- * re-shoot down the page cache and retry:
- */
- if (dropped_locks && ret)
- ret = 0;
-
- if (unlikely(ret < 0))
- goto err;
-
- if (unlikely(dropped_locks)) {
- ret = bch2_write_invalidate_inode_pages_range(mapping,
- req->ki_pos,
- req->ki_pos + iter_count - 1);
- if (unlikely(ret))
- goto err;
-
- if (!bio->bi_iter.bi_size)
- continue;
- }
-
- unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
- bio->bi_iter.bi_size -= unaligned;
- iov_iter_revert(&dio->iter, unaligned);
-
- if (!bio->bi_iter.bi_size) {
- /*
- * bio_iov_iter_get_pages was only able to get <
- * blocksize worth of pages:
- */
- ret = -EFAULT;
- goto err;
- }
-
- bch2_write_op_init(&dio->op, c, opts);
- dio->op.end_io = sync
- ? NULL
- : bch2_dio_write_loop_async;
- dio->op.target = dio->op.opts.foreground_target;
- dio->op.write_point = writepoint_hashed((unsigned long) current);
- dio->op.nr_replicas = dio->op.opts.data_replicas;
- dio->op.subvol = inode->ei_inum.subvol;
- dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
- dio->op.devs_need_flush = &inode->ei_devs_need_flush;
-
- if (sync)
- dio->op.flags |= BCH_WRITE_sync;
- dio->op.flags |= BCH_WRITE_check_enospc;
-
- ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
- bio_sectors(bio), true);
- if (unlikely(ret))
- goto err;
-
- ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
- dio->op.opts.data_replicas, 0);
- if (unlikely(ret) &&
- !bch2_dio_write_check_allocated(dio))
- goto err;
-
- task_io_account_write(bio->bi_iter.bi_size);
-
- if (unlikely(dio->iter.count) &&
- !dio->sync &&
- !dio->loop &&
- bch2_dio_write_copy_iov(dio))
- dio->sync = sync = true;
-
- dio->loop = true;
- closure_call(&dio->op.cl, bch2_write, NULL, NULL);
-
- if (!sync)
- return -EIOCBQUEUED;
-
- bch2_dio_write_end(dio);
-
- if (likely(!dio->iter.count) || dio->op.error)
- break;
-
- bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
- }
-out:
- return bch2_dio_write_done(dio);
-err:
- dio->op.error = ret;
-
- bio_release_pages(bio, false);
-
- bch2_quota_reservation_put(c, inode, &dio->quota_res);
- goto out;
-}
-
-static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
-{
- struct mm_struct *mm = dio->mm;
-
- bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
-
- if (mm)
- kthread_use_mm(mm);
- bch2_dio_write_loop(dio);
- if (mm)
- kthread_unuse_mm(mm);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *op)
-{
- struct dio_write *dio = container_of(op, struct dio_write, op);
-
- bch2_dio_write_end(dio);
-
- if (likely(!dio->iter.count) || dio->op.error)
- bch2_dio_write_done(dio);
- else
- bch2_dio_write_continue(dio);
-}
-
-ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
-{
- struct file *file = req->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct dio_write *dio;
- struct bio *bio;
- bool locked = true, extending;
- ssize_t ret;
-
- prefetch(&c->opts);
- prefetch((void *) &c->opts + 64);
- prefetch(&inode->ei_inode);
- prefetch((void *) &inode->ei_inode + 64);
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write))
- return -EROFS;
-
- inode_lock(&inode->v);
-
- ret = generic_write_checks(req, iter);
- if (unlikely(ret <= 0))
- goto err_put_write_ref;
-
- ret = file_remove_privs(file);
- if (unlikely(ret))
- goto err_put_write_ref;
-
- ret = file_update_time(file);
- if (unlikely(ret))
- goto err_put_write_ref;
-
- if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
- ret = -EINVAL;
- goto err_put_write_ref;
- }
-
- inode_dio_begin(&inode->v);
- bch2_pagecache_block_get(inode);
-
- extending = req->ki_pos + iter->count > inode->v.i_size;
- if (!extending) {
- inode_unlock(&inode->v);
- locked = false;
- }
-
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
- GFP_KERNEL,
- &c->dio_write_bioset);
- dio = container_of(bio, struct dio_write, op.wbio.bio);
- dio->req = req;
- dio->mapping = mapping;
- dio->inode = inode;
- dio->mm = current->mm;
- dio->iov = NULL;
- dio->loop = false;
- dio->extending = extending;
- dio->sync = is_sync_kiocb(req) || extending;
- dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
- dio->quota_res.sectors = 0;
- dio->written = 0;
- dio->iter = *iter;
- dio->op.c = c;
-
- if (unlikely(mapping->nrpages)) {
- ret = bch2_write_invalidate_inode_pages_range(mapping,
- req->ki_pos,
- req->ki_pos + iter->count - 1);
- if (unlikely(ret))
- goto err_put_bio;
- }
-
- ret = bch2_dio_write_loop(dio);
-out:
- if (locked)
- inode_unlock(&inode->v);
- return ret;
-err_put_bio:
- bch2_pagecache_block_put(inode);
- bio_put(bio);
- inode_dio_end(&inode->v);
-err_put_write_ref:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
- goto out;
-}
-
-void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
-{
- bioset_exit(&c->dio_write_bioset);
- bioset_exit(&c->dio_read_bioset);
-}
-
-int bch2_fs_fs_io_direct_init(struct bch_fs *c)
-{
- if (bioset_init(&c->dio_read_bioset,
- 4, offsetof(struct dio_read, rbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_dio_read_bioset_init;
-
- if (bioset_init(&c->dio_write_bioset,
- 4, offsetof(struct dio_write, op.wbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_dio_write_bioset_init;
-
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h
deleted file mode 100644
index 814621ec7f81..000000000000
--- a/fs/bcachefs/fs-io-direct.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_DIRECT_H
-#define _BCACHEFS_FS_IO_DIRECT_H
-
-#ifndef NO_BCACHEFS_FS
-ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
-ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
-
-void bch2_fs_fs_io_direct_exit(struct bch_fs *);
-int bch2_fs_fs_io_direct_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_DIRECT_H */
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
deleted file mode 100644
index c2cc405822f2..000000000000
--- a/fs/bcachefs/fs-io-pagecache.c
+++ /dev/null
@@ -1,827 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "btree_iter.h"
-#include "extents.h"
-#include "fs-io.h"
-#include "fs-io-pagecache.h"
-#include "subvolume.h"
-
-#include <linux/pagevec.h>
-#include <linux/writeback.h>
-
-int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
- loff_t start, u64 end,
- fgf_t fgp_flags, gfp_t gfp,
- folios *fs)
-{
- struct folio *f;
- u64 pos = start;
- int ret = 0;
-
- while (pos < end) {
- if ((u64) pos >= (u64) start + (1ULL << 20))
- fgp_flags &= ~FGP_CREAT;
-
- ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
- if (ret)
- break;
-
- f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
- if (IS_ERR(f))
- break;
-
- BUG_ON(fs->nr && folio_pos(f) != pos);
-
- pos = folio_end_pos(f);
- darray_push(fs, f);
- }
-
- if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
- ret = -ENOMEM;
-
- return fs->nr ? 0 : ret;
-}
-
-/* pagecache_block must be held */
-int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
- loff_t start, loff_t end)
-{
- int ret;
-
- /*
- * XXX: the way this is currently implemented, we can spin if a process
- * is continually redirtying a specific page
- */
- do {
- if (!mapping->nrpages)
- return 0;
-
- ret = filemap_write_and_wait_range(mapping, start, end);
- if (ret)
- break;
-
- if (!mapping->nrpages)
- return 0;
-
- ret = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
- } while (ret == -EBUSY);
-
- return ret;
-}
-
-#if 0
-/* Useful for debug tracing: */
-static const char * const bch2_folio_sector_states[] = {
-#define x(n) #n,
- BCH_FOLIO_SECTOR_STATE()
-#undef x
- NULL
-};
-#endif
-
-static inline enum bch_folio_sector_state
-folio_sector_dirty(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_unallocated:
- return SECTOR_dirty;
- case SECTOR_reserved:
- return SECTOR_dirty_reserved;
- default:
- return state;
- }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_undirty(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_dirty:
- return SECTOR_unallocated;
- case SECTOR_dirty_reserved:
- return SECTOR_reserved;
- default:
- return state;
- }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_reserve(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_unallocated:
- return SECTOR_reserved;
- case SECTOR_dirty:
- return SECTOR_dirty_reserved;
- default:
- return state;
- }
-}
-
-/* for newly allocated folios: */
-struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
- struct bch_folio *s;
-
- s = kzalloc(sizeof(*s) +
- sizeof(struct bch_folio_sector) *
- folio_sectors(folio), gfp);
- if (!s)
- return NULL;
-
- spin_lock_init(&s->lock);
- folio_attach_private(folio, s);
- return s;
-}
-
-struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
- return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
-}
-
-static unsigned bkey_to_sector_state(struct bkey_s_c k)
-{
- if (bkey_extent_is_reservation(k))
- return SECTOR_reserved;
- if (bkey_extent_is_allocation(k.k))
- return SECTOR_allocated;
- return SECTOR_unallocated;
-}
-
-static void __bch2_folio_set(struct folio *folio,
- unsigned pg_offset, unsigned pg_len,
- unsigned nr_ptrs, unsigned state)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, sectors = folio_sectors(folio);
-
- BUG_ON(pg_offset >= sectors);
- BUG_ON(pg_offset + pg_len > sectors);
-
- spin_lock(&s->lock);
-
- for (i = pg_offset; i < pg_offset + pg_len; i++) {
- s->s[i].nr_replicas = nr_ptrs;
- bch2_folio_sector_set(folio, s, i, state);
- }
-
- if (i == sectors)
- s->uptodate = true;
-
- spin_unlock(&s->lock);
-}
-
-/*
- * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
- * extents btree:
- */
-int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
- struct folio **fs, unsigned nr_folios)
-{
- u64 offset = folio_sector(fs[0]);
- bool need_set = false;
-
- for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
- struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- need_set |= !s->uptodate;
- }
-
- if (!need_set)
- return 0;
-
- unsigned folio_idx = 0;
-
- return bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
- POS(inum.inum, offset),
- POS(inum.inum, U64_MAX),
- inum.subvol, BTREE_ITER_slots, k, ({
- unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- while (folio_idx < nr_folios) {
- struct folio *folio = fs[folio_idx];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
- folio_start;
- unsigned folio_len = min(k.k->p.offset, folio_end) -
- folio_offset - folio_start;
-
- BUG_ON(k.k->p.offset < folio_start);
- BUG_ON(bkey_start_offset(k.k) > folio_end);
-
- if (!bch2_folio(folio)->uptodate)
- __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
- if (k.k->p.offset < folio_end)
- break;
- folio_idx++;
- }
-
- if (folio_idx == nr_folios)
- break;
- 0;
- })));
-}
-
-void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
-{
- struct bvec_iter iter;
- struct folio_vec fv;
- unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
- ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- bio_for_each_folio(fv, bio, iter)
- __bch2_folio_set(fv.fv_folio,
- fv.fv_offset >> 9,
- fv.fv_len >> 9,
- nr_ptrs, state);
-}
-
-void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
- u64 start, u64 end)
-{
- pgoff_t index = start >> PAGE_SECTORS_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct folio_batch fbatch;
- unsigned i, j;
-
- if (end <= start)
- return;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(inode->v.i_mapping,
- &index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
- struct bch_folio *s;
-
- BUG_ON(end <= folio_start);
-
- folio_lock(folio);
- s = bch2_folio(folio);
-
- if (s) {
- spin_lock(&s->lock);
- for (j = folio_offset; j < folio_offset + folio_len; j++)
- s->s[j].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
-
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-}
-
-int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
- u64 *start, u64 end,
- bool nonblocking)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct folio_batch fbatch;
- s64 i_sectors_delta = 0;
- int ret = 0;
-
- if (end <= *start)
- return 0;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(inode->v.i_mapping,
- &index, end_index, &fbatch)) {
- for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
-
- if (!nonblocking)
- folio_lock(folio);
- else if (!folio_trylock(folio)) {
- folio_batch_release(&fbatch);
- ret = -EAGAIN;
- break;
- }
-
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
-
- BUG_ON(end <= folio_start);
-
- *start = min(end, folio_end);
-
- struct bch_folio *s = bch2_folio(folio);
- if (s) {
- unsigned folio_offset = max(*start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-
- spin_lock(&s->lock);
- for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
- i_sectors_delta -= s->s[j].state == SECTOR_dirty;
- bch2_folio_sector_set(folio, s, j,
- folio_sector_reserve(s->s[j].state));
- }
- spin_unlock(&s->lock);
- }
-
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- return ret;
-}
-
-static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
- unsigned nr_replicas)
-{
- return max(0, (int) nr_replicas -
- s->nr_replicas -
- s->replicas_reserved);
-}
-
-int bch2_get_folio_disk_reservation(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio, bool check_enospc)
-{
- struct bch_folio *s = bch2_folio_create(folio, 0);
- unsigned nr_replicas = inode_nr_replicas(c, inode);
- struct disk_reservation disk_res = { 0 };
- unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
- int ret;
-
- if (!s)
- return -ENOMEM;
-
- for (i = 0; i < sectors; i++)
- disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
-
- if (!disk_res_sectors)
- return 0;
-
- ret = bch2_disk_reservation_get(c, &disk_res,
- disk_res_sectors, 1,
- !check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL
- : 0);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < sectors; i++)
- s->s[i].replicas_reserved +=
- sectors_to_reserve(&s->s[i], nr_replicas);
-
- return 0;
-}
-
-void bch2_folio_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch2_folio_reservation *res)
-{
- bch2_disk_reservation_put(c, &res->disk);
- bch2_quota_reservation_put(c, inode, &res->quota);
-}
-
-static int __bch2_folio_reservation_get(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- size_t offset, size_t len,
- bool partial)
-{
- struct bch_folio *s = bch2_folio_create(folio, 0);
- unsigned i, disk_sectors = 0, quota_sectors = 0;
- struct disk_reservation disk_res = {};
- size_t reserved = len;
- int ret;
-
- if (!s)
- return -ENOMEM;
-
- BUG_ON(!s->uptodate);
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
- quota_sectors += s->s[i].state == SECTOR_unallocated;
- }
-
- if (disk_sectors) {
- ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
- partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
- if (unlikely(ret))
- return ret;
-
- if (unlikely(disk_res.sectors != disk_sectors)) {
- disk_sectors = quota_sectors = 0;
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
- if (disk_sectors > disk_res.sectors) {
- /*
- * Make sure to get a reservation that's
- * aligned to the filesystem blocksize:
- */
- unsigned reserved_offset = round_down(i << 9, block_bytes(c));
- reserved = clamp(reserved_offset, offset, offset + len) - offset;
-
- if (!reserved) {
- bch2_disk_reservation_put(c, &disk_res);
- return bch_err_throw(c, ENOSPC_disk_reservation);
- }
- break;
- }
- quota_sectors += s->s[i].state == SECTOR_unallocated;
- }
- }
- }
-
- if (quota_sectors) {
- ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
- if (unlikely(ret)) {
- bch2_disk_reservation_put(c, &disk_res);
- return ret;
- }
- }
-
- res->disk.sectors += disk_res.sectors;
- return partial ? reserved : 0;
-}
-
-int bch2_folio_reservation_get(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- size_t offset, size_t len)
-{
- return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
-}
-
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- size_t offset, size_t len)
-{
- return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
-}
-
-static void bch2_clear_folio_bits(struct folio *folio)
-{
- struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_folio *s = bch2_folio(folio);
- struct disk_reservation disk_res = { 0 };
- int i, sectors = folio_sectors(folio), dirty_sectors = 0;
-
- if (!s)
- return;
-
- EBUG_ON(!folio_test_locked(folio));
- EBUG_ON(folio_test_writeback(folio));
-
- for (i = 0; i < sectors; i++) {
- disk_res.sectors += s->s[i].replicas_reserved;
- s->s[i].replicas_reserved = 0;
-
- dirty_sectors -= s->s[i].state == SECTOR_dirty;
- bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
- }
-
- bch2_disk_reservation_put(c, &disk_res);
-
- bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
-
- bch2_folio_release(folio);
-}
-
-void bch2_set_folio_dirty(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- unsigned offset, unsigned len)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, dirty_sectors = 0;
-
- WARN_ON((u64) folio_pos(folio) + offset + len >
- round_up((u64) i_size_read(&inode->v), block_bytes(c)));
-
- BUG_ON(!s->uptodate);
-
- spin_lock(&s->lock);
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- unsigned sectors = sectors_to_reserve(&s->s[i],
- res->disk.nr_replicas);
-
- /*
- * This can happen if we race with the error path in
- * bch2_writepage_io_done():
- */
- sectors = min_t(unsigned, sectors, res->disk.sectors);
-
- s->s[i].replicas_reserved += sectors;
- res->disk.sectors -= sectors;
-
- dirty_sectors += s->s[i].state == SECTOR_unallocated;
-
- bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
- }
-
- spin_unlock(&s->lock);
-
- bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
-
- if (!folio_test_dirty(folio))
- filemap_dirty_folio(inode->v.i_mapping, folio);
-}
-
-vm_fault_t bch2_page_fault(struct vm_fault *vmf)
-{
- struct file *file = vmf->vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct address_space *fdm = faults_disabled_mapping();
- struct bch_inode_info *inode = file_bch_inode(file);
- vm_fault_t ret;
-
- if (fdm == mapping)
- return VM_FAULT_SIGBUS;
-
- /* Lock ordering: */
- if (fdm > mapping) {
- struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
-
- if (bch2_pagecache_add_tryget(inode))
- goto got_lock;
-
- bch2_pagecache_block_put(fdm_host);
-
- bch2_pagecache_add_get(inode);
- bch2_pagecache_add_put(inode);
-
- bch2_pagecache_block_get(fdm_host);
-
- /* Signal that lock has been dropped: */
- set_fdm_dropped_locks();
- return VM_FAULT_SIGBUS;
- }
-
- bch2_pagecache_add_get(inode);
-got_lock:
- ret = filemap_fault(vmf);
- bch2_pagecache_add_put(inode);
-
- return ret;
-}
-
-vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
-{
- struct folio *folio = page_folio(vmf->page);
- struct file *file = vmf->vma->vm_file;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = file->f_mapping;
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation res;
- vm_fault_t ret;
-
- loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c));
- unsigned offset = file_offset - folio_pos(folio);
- unsigned len = max(PAGE_SIZE, block_bytes(c));
-
- BUG_ON(offset + len > folio_size(folio));
-
- bch2_folio_reservation_init(c, inode, &res);
-
- sb_start_pagefault(inode->v.i_sb);
- file_update_time(file);
-
- /*
- * Not strictly necessary, but helps avoid dio writes livelocking in
- * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
- * a bch2_write_invalidate_inode_pages_range() that works without dropping
- * page lock before invalidating page
- */
- bch2_pagecache_add_get(inode);
-
- folio_lock(folio);
- u64 isize = i_size_read(&inode->v);
-
- if (folio->mapping != mapping || file_offset >= isize) {
- folio_unlock(folio);
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- len = min_t(unsigned, len, isize - file_offset);
-
- if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
- bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) {
- folio_unlock(folio);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
- bch2_set_folio_dirty(c, inode, folio, &res, offset, len);
- bch2_folio_reservation_put(c, inode, &res);
-
- folio_wait_stable(folio);
- ret = VM_FAULT_LOCKED;
-out:
- bch2_pagecache_add_put(inode);
- sb_end_pagefault(inode->v.i_sb);
-
- return ret;
-}
-
-void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
-{
- if (offset || length < folio_size(folio))
- return;
-
- bch2_clear_folio_bits(folio);
-}
-
-bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
-{
- if (folio_test_dirty(folio) || folio_test_writeback(folio))
- return false;
-
- bch2_clear_folio_bits(folio);
- return true;
-}
-
-/* fseek: */
-
-static int folio_data_offset(struct folio *folio, loff_t pos,
- unsigned min_replicas)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, sectors = folio_sectors(folio);
-
- if (s)
- for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
- if (s->s[i].state >= SECTOR_dirty &&
- s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
- return i << SECTOR_SHIFT;
-
- return -1;
-}
-
-loff_t bch2_seek_pagecache_data(struct inode *vinode,
- loff_t start_offset,
- loff_t end_offset,
- unsigned min_replicas,
- bool nonblock)
-{
- struct folio_batch fbatch;
- pgoff_t start_index = start_offset >> PAGE_SHIFT;
- pgoff_t end_index = end_offset >> PAGE_SHIFT;
- pgoff_t index = start_index;
- unsigned i;
- loff_t ret;
- int offset;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(vinode->i_mapping,
- &index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
-
- if (!nonblock) {
- folio_lock(folio);
- } else if (!folio_trylock(folio)) {
- folio_batch_release(&fbatch);
- return -EAGAIN;
- }
-
- offset = folio_data_offset(folio,
- max(folio_pos(folio), start_offset),
- min_replicas);
- if (offset >= 0) {
- ret = clamp(folio_pos(folio) + offset,
- start_offset, end_offset);
- folio_unlock(folio);
- folio_batch_release(&fbatch);
- return ret;
- }
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-
- return end_offset;
-}
-
-/*
- * Search for a hole in a folio.
- *
- * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
- * code to indicate a pagecache hole exists at the returned offset. Otherwise
- * return 0 if the folio is filled with data, or an error code. This function
- * can return -EAGAIN if nonblock is specified.
- */
-static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
- unsigned min_replicas, bool nonblock)
-{
- struct folio *folio;
- struct bch_folio *s;
- unsigned i, sectors;
- int ret = -ENOENT;
-
- folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
- FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
- if (IS_ERR(folio))
- return PTR_ERR(folio);
-
- s = bch2_folio(folio);
- if (!s)
- goto unlock;
-
- sectors = folio_sectors(folio);
- for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
- if (s->s[i].state < SECTOR_dirty ||
- s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
- *offset = max(*offset,
- folio_pos(folio) + (i << SECTOR_SHIFT));
- goto unlock;
- }
-
- *offset = folio_end_pos(folio);
- ret = 0;
-unlock:
- folio_unlock(folio);
- folio_put(folio);
- return ret;
-}
-
-loff_t bch2_seek_pagecache_hole(struct inode *vinode,
- loff_t start_offset,
- loff_t end_offset,
- unsigned min_replicas,
- bool nonblock)
-{
- struct address_space *mapping = vinode->i_mapping;
- loff_t offset = start_offset;
- loff_t ret = 0;
-
- while (!ret && offset < end_offset)
- ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
-
- if (ret && ret != -ENOENT)
- return ret;
- return min(offset, end_offset);
-}
-
-int bch2_clamp_data_hole(struct inode *inode,
- u64 *hole_start,
- u64 *hole_end,
- unsigned min_replicas,
- bool nonblock)
-{
- loff_t ret;
-
- ret = bch2_seek_pagecache_hole(inode,
- *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
- if (ret < 0)
- return ret;
-
- *hole_start = ret;
-
- if (*hole_start == *hole_end)
- return 0;
-
- ret = bch2_seek_pagecache_data(inode,
- *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
- if (ret < 0)
- return ret;
-
- *hole_end = ret;
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
deleted file mode 100644
index fad911cf5068..000000000000
--- a/fs/bcachefs/fs-io-pagecache.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
-#define _BCACHEFS_FS_IO_PAGECACHE_H
-
-#include <linux/pagemap.h>
-
-typedef DARRAY(struct folio *) folios;
-
-int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
- u64, fgf_t, gfp_t, folios *);
-int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
-
-/*
- * Use u64 for the end pos and sector helpers because if the folio covers the
- * max supported range of the mapping, the start offset of the next folio
- * overflows loff_t. This breaks much of the range based processing in the
- * buffered write path.
- */
-static inline u64 folio_end_pos(struct folio *folio)
-{
- return folio_pos(folio) + folio_size(folio);
-}
-
-static inline size_t folio_sectors(struct folio *folio)
-{
- return PAGE_SECTORS << folio_order(folio);
-}
-
-static inline loff_t folio_sector(struct folio *folio)
-{
- return folio_pos(folio) >> 9;
-}
-
-static inline u64 folio_end_sector(struct folio *folio)
-{
- return folio_end_pos(folio) >> 9;
-}
-
-#define BCH_FOLIO_SECTOR_STATE() \
- x(unallocated) \
- x(reserved) \
- x(dirty) \
- x(dirty_reserved) \
- x(allocated)
-
-enum bch_folio_sector_state {
-#define x(n) SECTOR_##n,
- BCH_FOLIO_SECTOR_STATE()
-#undef x
-};
-
-struct bch_folio_sector {
- /* Uncompressed, fully allocated replicas (or on disk reservation): */
- u8 nr_replicas:4,
- /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
- replicas_reserved:4;
- u8 state;
-};
-
-struct bch_folio {
- spinlock_t lock;
- atomic_t write_count;
- /*
- * Is the sector state up to date with the btree?
- * (Not the data itself)
- */
- bool uptodate;
- struct bch_folio_sector s[];
-};
-
-/* Helper for when we need to add debug instrumentation: */
-static inline void bch2_folio_sector_set(struct folio *folio,
- struct bch_folio *s,
- unsigned i, unsigned n)
-{
- s->s[i].state = n;
-}
-
-/* file offset (to folio offset) to bch_folio_sector index */
-static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
-{
- u64 f_offset = pos - folio_pos(folio);
-
- BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
- return f_offset >> SECTOR_SHIFT;
-}
-
-/* for newly allocated folios: */
-static inline void __bch2_folio_release(struct folio *folio)
-{
- kfree(folio_detach_private(folio));
-}
-
-static inline void bch2_folio_release(struct folio *folio)
-{
- EBUG_ON(!folio_test_locked(folio));
- __bch2_folio_release(folio);
-}
-
-static inline struct bch_folio *__bch2_folio(struct folio *folio)
-{
- return folio_get_private(folio);
-}
-
-static inline struct bch_folio *bch2_folio(struct folio *folio)
-{
- EBUG_ON(!folio_test_locked(folio));
-
- return __bch2_folio(folio);
-}
-
-struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
-struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
-
-struct bch2_folio_reservation {
- struct disk_reservation disk;
- struct quota_res quota;
-};
-
-static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
-{
- /* XXX: this should not be open coded */
- return inode->ei_inode.bi_data_replicas
- ? inode->ei_inode.bi_data_replicas - 1
- : c->opts.data_replicas;
-}
-
-static inline void bch2_folio_reservation_init(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch2_folio_reservation *res)
-{
- memset(res, 0, sizeof(*res));
-
- res->disk.nr_replicas = inode_nr_replicas(c, inode);
-}
-
-int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
-void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
-
-void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
-
-int bch2_get_folio_disk_reservation(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *, bool);
-
-void bch2_folio_reservation_put(struct bch_fs *,
- struct bch_inode_info *,
- struct bch2_folio_reservation *);
-int bch2_folio_reservation_get(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *,
- struct bch2_folio_reservation *,
- size_t, size_t);
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *,
- struct bch2_folio_reservation *,
- size_t, size_t);
-
-void bch2_set_folio_dirty(struct bch_fs *,
- struct bch_inode_info *,
- struct folio *,
- struct bch2_folio_reservation *,
- unsigned, unsigned);
-
-vm_fault_t bch2_page_fault(struct vm_fault *);
-vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidate_folio(struct folio *, size_t, size_t);
-bool bch2_release_folio(struct folio *, gfp_t);
-
-loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
-loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
-int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
-
-#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
deleted file mode 100644
index a233f45875e9..000000000000
--- a/fs/bcachefs/fs-io.c
+++ /dev/null
@@ -1,1102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-pagecache.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal.h"
-#include "io_misc.h"
-#include "keylist.h"
-#include "quota.h"
-#include "reflink.h"
-#include "trace.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/falloc.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/pagevec.h>
-#include <linux/rmap.h>
-#include <linux/sched/signal.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/uio.h>
-
-#include <trace/events/writeback.h>
-
-struct nocow_flush {
- struct closure *cl;
- struct bch_dev *ca;
- struct bio bio;
-};
-
-static void nocow_flush_endio(struct bio *_bio)
-{
-
- struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
-
- closure_put(bio->cl);
- enumerated_ref_put(&bio->ca->io_ref[WRITE],
- BCH_DEV_WRITE_REF_nocow_flush);
- bio_put(&bio->bio);
-}
-
-void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct closure *cl)
-{
- struct nocow_flush *bio;
- struct bch_dev *ca;
- struct bch_devs_mask devs;
- unsigned dev;
-
- dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
- if (dev == BCH_SB_MEMBERS_MAX)
- return;
-
- devs = inode->ei_devs_need_flush;
- memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
-
- for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
- scoped_guard(rcu) {
- ca = rcu_dereference(c->devs[dev]);
- if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
- BCH_DEV_WRITE_REF_nocow_flush))
- ca = NULL;
- }
-
- if (!ca)
- continue;
-
- bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
- REQ_OP_WRITE|REQ_PREFLUSH,
- GFP_KERNEL,
- &c->nocow_flush_bioset),
- struct nocow_flush, bio);
- bio->cl = cl;
- bio->ca = ca;
- bio->bio.bi_end_io = nocow_flush_endio;
- closure_bio_submit(&bio->bio, cl);
- }
-}
-
-static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- struct closure cl;
-
- closure_init_stack(&cl);
- bch2_inode_flush_nocow_writes_async(c, inode, &cl);
- closure_sync(&cl);
-
- return 0;
-}
-
-/* i_size updates: */
-
-struct inode_new_size {
- loff_t new_size;
- u64 now;
- unsigned fields;
-};
-
-static int inode_set_size(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct inode_new_size *s = p;
-
- bi->bi_size = s->new_size;
- if (s->fields & ATTR_ATIME)
- bi->bi_atime = s->now;
- if (s->fields & ATTR_MTIME)
- bi->bi_mtime = s->now;
- if (s->fields & ATTR_CTIME)
- bi->bi_ctime = s->now;
-
- return 0;
-}
-
-int __must_check bch2_write_inode_size(struct bch_fs *c,
- struct bch_inode_info *inode,
- loff_t new_size, unsigned fields)
-{
- struct inode_new_size s = {
- .new_size = new_size,
- .now = bch2_current_time(c),
- .fields = fields,
- };
-
- return bch2_write_inode(c, inode, inode_set_size, &s, fields);
-}
-
-void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, s64 sectors)
-{
- if (unlikely((s64) inode->v.i_blocks + sectors < 0)) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
- inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
- inode->ei_inode.bi_sectors);
-
- bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf);
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- if (sectors < 0)
- sectors = -inode->v.i_blocks;
- else
- sectors = 0;
- }
-
- inode->v.i_blocks += sectors;
-
-#ifdef CONFIG_BCACHEFS_QUOTA
- if (quota_res &&
- !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
- sectors > 0) {
- BUG_ON(sectors > quota_res->sectors);
- BUG_ON(sectors > inode->ei_quota_reserved);
-
- quota_res->sectors -= sectors;
- inode->ei_quota_reserved -= sectors;
- } else {
- bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
- }
-#endif
-}
-
-/* fsync: */
-
-static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum,
- u64 *seq)
-{
- struct printbuf buf = PRINTBUF;
- struct bch_inode_unpacked u;
- struct btree_iter iter;
- int ret = bch2_inode_peek(trans, &iter, &u, inum, 0);
- if (ret)
- return ret;
-
- u64 cur_seq = journal_cur_seq(&trans->c->journal);
- *seq = min(cur_seq, u.bi_journal_seq);
-
- if (fsck_err_on(u.bi_journal_seq > cur_seq,
- trans, inode_journal_seq_in_future,
- "inode journal seq in future (currently at %llu)\n%s",
- cur_seq,
- (bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- u.bi_journal_seq = cur_seq;
- ret = bch2_inode_write(trans, &iter, &u);
- }
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
- * insert trigger: look up the btree inode instead
- */
-static int bch2_flush_inode(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- if (c->opts.journal_flush_disabled)
- return 0;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync))
- return -EROFS;
-
- u64 seq;
- int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
- bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
- bch2_inode_flush_nocow_writes(c, inode);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync);
- return ret;
-}
-
-int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret, err;
-
- trace_bch2_fsync(file, datasync);
-
- ret = file_write_and_wait_range(file, start, end);
- if (ret)
- goto out;
- ret = sync_inode_metadata(&inode->v, 1);
- if (ret)
- goto out;
- ret = bch2_flush_inode(c, inode);
-out:
- ret = bch2_err_class(ret);
- if (ret == -EROFS)
- ret = -EIO;
-
- err = file_check_and_advance_wb_err(file);
- if (!ret)
- ret = err;
-
- return ret;
-}
-
-/* truncate: */
-
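-/* Returns nonzero if the range contains any written (non-unwritten) data extents: */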
-static inline int range_has_data(struct bch_fs *c, u32 subvol,
- struct bpos start,
- struct bpos end)
-{
- return bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end,
- subvol, 0, k, ({
- bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
- })));
-}
-
-static int __bch2_truncate_folio(struct bch_inode_info *inode,
- pgoff_t index, loff_t start, loff_t end)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- struct bch_folio *s;
- unsigned start_offset;
- unsigned end_offset;
- unsigned i;
- struct folio *folio;
- s64 i_sectors_delta = 0;
- int ret = 0;
- u64 end_pos;
-
- folio = filemap_lock_folio(mapping, index);
- if (IS_ERR_OR_NULL(folio)) {
- /*
- * XXX: we're doing two index lookups when we end up reading the
- * folio
- */
- ret = range_has_data(c, inode->ei_inum.subvol,
- POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
- POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
- if (ret <= 0)
- return ret;
-
- folio = __filemap_get_folio(mapping, index,
- FGP_LOCK|FGP_CREAT, GFP_KERNEL);
- if (IS_ERR(folio)) {
- ret = -ENOMEM;
- goto out;
- }
- }
-
- BUG_ON(start >= folio_end_pos(folio));
- BUG_ON(end <= folio_pos(folio));
-
- start_offset = max(start, folio_pos(folio)) - folio_pos(folio);
- end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);
-
- /* Folio boundary? Nothing to do */
- if (start_offset == 0 &&
- end_offset == folio_size(folio)) {
- ret = 0;
- goto unlock;
- }
-
- s = bch2_folio_create(folio, 0);
- if (!s) {
- ret = -ENOMEM;
- goto unlock;
- }
-
- if (!folio_test_uptodate(folio)) {
- ret = bch2_read_single_folio(folio, mapping);
- if (ret)
- goto unlock;
- }
-
- ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
- if (ret)
- goto unlock;
-
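- /* For blocks wholly inside the truncated range, drop dirty-sector accounting and mark them unallocated: */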
- for (i = round_up(start_offset, block_bytes(c)) >> 9;
- i < round_down(end_offset, block_bytes(c)) >> 9;
- i++) {
- s->s[i].nr_replicas = 0;
-
- i_sectors_delta -= s->s[i].state == SECTOR_dirty;
- bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
- }
-
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
- /*
- * Caller needs to know whether this folio will be written out by
- * writeback - doing an i_size update if necessary - or whether it will
- * be responsible for the i_size update.
- *
- * Note that we shouldn't ever see a folio beyond EOF, but check and
- * warn if so. This has been observed due to a failure to clean up folios
- * after a short write, and there's still a chance reclaim will fix
- * things up.
- */
- WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
- end_pos = folio_end_pos(folio);
- if (inode->v.i_size > folio_pos(folio))
- end_pos = min_t(u64, inode->v.i_size, end_pos);
- ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
-
- folio_zero_segment(folio, start_offset, end_offset);
-
- /*
- * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
- *
- * XXX: because we aren't currently tracking whether the folio has actual
- * data in it (vs. just 0s, or only partially written), this is wrong. ick.
- */
- BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
-
- /*
- * This removes any writeable userspace mappings; we need to force
- * .page_mkwrite to be called again before any mmapped writes, to
- * redirty the full page:
- */
- folio_mkclean(folio);
- filemap_dirty_folio(mapping, folio);
-unlock:
- folio_unlock(folio);
- folio_put(folio);
-out:
- return ret;
-}
-
-static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
-{
- return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
- from, ANYSINT_MAX(loff_t));
-}
-
-static int bch2_truncate_folios(struct bch_inode_info *inode,
- loff_t start, loff_t end)
-{
- int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
- start, end);
-
- if (ret >= 0 &&
- start >> PAGE_SHIFT != end >> PAGE_SHIFT)
- ret = __bch2_truncate_folio(inode,
- (end - 1) >> PAGE_SHIFT,
- start, end);
- return ret;
-}
-
-static int bch2_extend(struct mnt_idmap *idmap,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *inode_u,
- struct iattr *iattr)
-{
- struct address_space *mapping = inode->v.i_mapping;
- int ret;
-
- /*
- * sync appends:
- *
- * this has to be done _before_ extending i_size:
- */
- ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
- if (ret)
- return ret;
-
- truncate_setsize(&inode->v, iattr->ia_size);
-
- return bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-int bchfs_truncate(struct mnt_idmap *idmap,
- struct bch_inode_info *inode, struct iattr *iattr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- struct bch_inode_unpacked inode_u;
- s64 i_sectors_delta = 0;
- int ret = 0;
-
- /*
- * If the truncate call will change the size of the file, the
- * ctime and mtime should be updated. If the size will not change, we
- * do not need to update them.
- */
- if (iattr->ia_size != inode->v.i_size) {
- if (!(iattr->ia_valid & ATTR_MTIME))
- ktime_get_coarse_real_ts64(&iattr->ia_mtime);
- if (!(iattr->ia_valid & ATTR_CTIME))
- ktime_get_coarse_real_ts64(&iattr->ia_ctime);
- iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
- }
-
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(inode);
-
- ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
- if (ret)
- goto err;
-
- /*
- * check this before next assertion; on filesystem error our normal
- * invariants are a bit broken (truncate has to truncate the page cache
- * before the inode).
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- goto err;
-
- WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
- inode->v.i_size < inode_u.bi_size,
- "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
- (u64) inode->v.i_size, inode_u.bi_size);
-
- if (iattr->ia_size > inode->v.i_size) {
- ret = bch2_extend(idmap, inode, &inode_u, iattr);
- goto err;
- }
-
- iattr->ia_valid &= ~ATTR_SIZE;
-
- ret = bch2_truncate_folio(inode, iattr->ia_size);
- if (unlikely(ret < 0))
- goto err;
- ret = 0;
-
- truncate_setsize(&inode->v, iattr->ia_size);
-
- /*
- * When extending, we're going to write the new i_size to disk
- * immediately so we need to flush anything above the current on disk
- * i_size first:
- *
- * Also, when extending we need to flush the page that i_size currently
- * straddles - if it's mapped to userspace, we need to ensure that
- * userspace has to redirty it and call .mkwrite -> set_page_dirty
- * again to allocate the part of the page that was extended.
- */
- if (iattr->ia_size > inode_u.bi_size)
- ret = filemap_write_and_wait_range(mapping,
- inode_u.bi_size,
- iattr->ia_size - 1);
- else if (iattr->ia_size & (PAGE_SIZE - 1))
- ret = filemap_write_and_wait_range(mapping,
- round_down(iattr->ia_size, PAGE_SIZE),
- iattr->ia_size - 1);
- if (ret)
- goto err;
-
- ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
- if (unlikely(ret)) {
- /*
- * If we error here, VFS caches are now inconsistent with btree
- */
- set_bit(EI_INODE_ERROR, &inode->ei_flags);
- goto err;
- }
-
- if (unlikely(!inode->v.i_size && inode->v.i_blocks &&
- !bch2_journal_error(&c->journal))) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf,
- "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
- inode->v.i_ino, (u64) inode->v.i_blocks,
- inode->ei_inode.bi_sectors);
-
- bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_setattr_nonsize(idmap, inode, iattr);
-err:
- bch2_pagecache_block_put(inode);
- return bch2_err_class(ret);
-}
-
-/* fallocate: */
-
-static int inode_update_times_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi, void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
- return 0;
-}
-
-static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 end = offset + len;
- u64 block_start = round_up(offset, block_bytes(c));
- u64 block_end = round_down(end, block_bytes(c));
- bool truncated_last_page;
- int ret = 0;
-
- ret = bch2_truncate_folios(inode, offset, end);
- if (unlikely(ret < 0))
- goto err;
-
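- /* A positive return means the last partial folio was dirty; writeback will handle the on-disk i_size update: */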
- truncated_last_page = ret;
-
- truncate_pagecache_range(&inode->v, offset, end - 1);
-
- if (block_start < block_end) {
- s64 i_sectors_delta = 0;
-
- ret = bch2_fpunch(c, inode_inum(inode),
- block_start >> 9, block_end >> 9,
- &i_sectors_delta);
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- }
-
- mutex_lock(&inode->ei_update_lock);
- if (end >= inode->v.i_size && !truncated_last_page) {
- ret = bch2_write_inode_size(c, inode, inode->v.i_size,
- ATTR_MTIME|ATTR_CTIME);
- } else {
- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
- ATTR_MTIME|ATTR_CTIME);
- }
- mutex_unlock(&inode->ei_update_lock);
-err:
- return ret;
-}
-
-static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
- loff_t offset, loff_t len,
- bool insert)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
- s64 i_sectors_delta = 0;
- int ret = 0;
-
- if ((offset | len) & (block_bytes(c) - 1))
- return -EINVAL;
-
- if (insert) {
- if (offset >= inode->v.i_size)
- return -EINVAL;
- } else {
- if (offset + len >= inode->v.i_size)
- return -EINVAL;
- }
-
- ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
- if (ret)
- return ret;
-
- if (insert)
- i_size_write(&inode->v, inode->v.i_size + len);
-
- ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
- insert, &i_sectors_delta);
- if (!ret && !insert)
- i_size_write(&inode->v, inode->v.i_size - len);
- bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
- return ret;
-}
-
-static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
- u64 start_sector, u64 end_sector)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bpos end_pos = POS(inode->v.i_ino, end_sector);
- struct bch_io_opts opts;
- int ret = 0;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inode->v.i_ino, start_sector),
- BTREE_ITER_slots|BTREE_ITER_intent);
-
- while (!ret) {
- s64 i_sectors_delta = 0;
- struct quota_res quota_res = { 0 };
- struct bkey_s_c k;
- unsigned sectors;
- bool is_allocation;
- u64 hole_start, hole_end;
- u32 snapshot;
-
- bch2_trans_begin(trans);
-
- if (bkey_ge(iter.pos, end_pos))
- break;
-
- ret = bch2_subvolume_get_snapshot(trans,
- inode->ei_inum.subvol, &snapshot);
- if (ret)
- goto bkey_err;
-
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
- k = bch2_btree_iter_peek_slot(trans, &iter);
- if ((ret = bkey_err(k)))
- goto bkey_err;
-
- hole_start = iter.pos.offset;
- hole_end = bpos_min(k.k->p, end_pos).offset;
- is_allocation = bkey_extent_is_allocation(k.k);
-
- /* already reserved */
- if (bkey_extent_is_reservation(k) &&
- bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
- bch2_btree_iter_advance(trans, &iter);
- continue;
- }
-
- if (bkey_extent_is_data(k.k) &&
- !(mode & FALLOC_FL_ZERO_RANGE)) {
- bch2_btree_iter_advance(trans, &iter);
- continue;
- }
-
- if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- /*
- * Lock ordering - can't be holding btree locks while
- * blocking on a folio lock:
- */
- if (bch2_clamp_data_hole(&inode->v,
- &hole_start,
- &hole_end,
- opts.data_replicas, true)) {
- ret = drop_locks_do(trans,
- (bch2_clamp_data_hole(&inode->v,
- &hole_start,
- &hole_end,
- opts.data_replicas, false), 0));
- if (ret)
- goto bkey_err;
- }
- bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start));
-
- if (ret)
- goto bkey_err;
-
- if (hole_start == hole_end)
- continue;
- }
-
- sectors = hole_end - hole_start;
-
- if (!is_allocation) {
- ret = bch2_quota_reservation_add(c, inode,
- &quota_res, sectors, true);
- if (unlikely(ret))
- goto bkey_err;
- }
-
- ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
- sectors, opts, &i_sectors_delta,
- writepoint_hashed((unsigned long) current));
- if (ret)
- goto bkey_err;
-
- bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
-
- if (bch2_mark_pagecache_reserved(inode, &hole_start,
- iter.pos.offset, true)) {
- ret = drop_locks_do(trans,
- bch2_mark_pagecache_reserved(inode, &hole_start,
- iter.pos.offset, false));
- if (ret)
- goto bkey_err;
- }
-bkey_err:
- bch2_quota_reservation_put(c, inode, &quota_res);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- }
-
- if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
- struct quota_res quota_res = { 0 };
- s64 i_sectors_delta = 0;
-
- bch2_fpunch_at(trans, &iter, inode_inum(inode),
- end_sector, &i_sectors_delta);
- bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
- bch2_quota_reservation_put(c, inode, &quota_res);
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
- loff_t offset, loff_t len)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 end = offset + len;
- u64 block_start = round_down(offset, block_bytes(c));
- u64 block_end = round_up(end, block_bytes(c));
- bool truncated_last_page = false;
- int ret, ret2 = 0;
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
- ret = inode_newsize_ok(&inode->v, end);
- if (ret)
- return ret;
- }
-
- if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = bch2_truncate_folios(inode, offset, end);
- if (unlikely(ret < 0))
- return ret;
-
- truncated_last_page = ret;
-
- truncate_pagecache_range(&inode->v, offset, end - 1);
-
- block_start = round_up(offset, block_bytes(c));
- block_end = round_down(end, block_bytes(c));
- }
-
- ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
-
- /*
- * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
- * so that the VFS cache i_size is consistent with the btree i_size:
- */
- if (ret &&
- !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
- return ret;
-
- if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
- end = inode->v.i_size;
-
- if (end >= inode->v.i_size &&
- (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
- !(mode & FALLOC_FL_KEEP_SIZE))) {
- spin_lock(&inode->v.i_lock);
- i_size_write(&inode->v, end);
- spin_unlock(&inode->v.i_lock);
-
- mutex_lock(&inode->ei_update_lock);
- ret2 = bch2_write_inode_size(c, inode, end, 0);
- mutex_unlock(&inode->ei_update_lock);
- }
-
- return ret ?: ret2;
-}
-
-long bch2_fallocate_dispatch(struct file *file, int mode,
- loff_t offset, loff_t len)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- long ret;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate))
- return -EROFS;
-
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(inode);
-
- ret = file_modified(file);
- if (ret)
- goto err;
-
- if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
- ret = bchfs_fallocate(inode, mode, offset, len);
- else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
- ret = bchfs_fpunch(inode, offset, len);
- else if (mode == FALLOC_FL_INSERT_RANGE)
- ret = bchfs_fcollapse_finsert(inode, offset, len, true);
- else if (mode == FALLOC_FL_COLLAPSE_RANGE)
- ret = bchfs_fcollapse_finsert(inode, offset, len, false);
- else
- ret = -EOPNOTSUPP;
-err:
- bch2_pagecache_block_put(inode);
- inode_unlock(&inode->v);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate);
-
- return bch2_err_class(ret);
-}
-
-/*
- * Take a quota reservation for unallocated blocks in a given file range;
- * does not check the pagecache.
- */
-static int quota_reserve_range(struct bch_inode_info *inode,
- struct quota_res *res,
- u64 start, u64 end)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 sectors = end - start;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter,
- BTREE_ID_extents,
- POS(inode->v.i_ino, start),
- POS(inode->v.i_ino, end - 1),
- inode->ei_inum.subvol, 0, k, ({
- if (bkey_extent_is_allocation(k.k)) {
- u64 s = min(end, k.k->p.offset) -
- max(start, bkey_start_offset(k.k));
- BUG_ON(s > sectors);
- sectors -= s;
- }
-
- 0;
- })));
-
- return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
-}
-
-loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- loff_t len, unsigned remap_flags)
-{
- struct bch_inode_info *src = file_bch_inode(file_src);
- struct bch_inode_info *dst = file_bch_inode(file_dst);
- struct bch_fs *c = src->v.i_sb->s_fs_info;
- struct quota_res quota_res = { 0 };
- s64 i_sectors_delta = 0;
- u64 aligned_len;
- loff_t ret = 0;
-
- if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
- return -EINVAL;
-
- if ((pos_src & (block_bytes(c) - 1)) ||
- (pos_dst & (block_bytes(c) - 1)))
- return -EINVAL;
-
- if (src == dst &&
- abs(pos_src - pos_dst) < len)
- return -EINVAL;
-
- lock_two_nondirectories(&src->v, &dst->v);
- bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
-
- inode_dio_wait(&src->v);
- inode_dio_wait(&dst->v);
-
- ret = generic_remap_file_range_prep(file_src, pos_src,
- file_dst, pos_dst,
- &len, remap_flags);
- if (ret < 0 || len == 0)
- goto err;
-
- aligned_len = round_up((u64) len, block_bytes(c));
-
- ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
- pos_dst, pos_dst + len - 1);
- if (ret)
- goto err;
-
- ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
- (pos_dst + aligned_len) >> 9);
- if (ret)
- goto err;
-
- if (!(remap_flags & REMAP_FILE_DEDUP))
- file_update_time(file_dst);
-
- bch2_mark_pagecache_unallocated(src, pos_src >> 9,
- (pos_src + aligned_len) >> 9);
-
- /*
- * XXX: we'd like to be telling bch2_remap_range() if we have
- * permission to write to the source file, and thus if io path option
- * changes should be propagated through the copy, but we need the mnt_idmap
- * from the pathwalk, which is awkward.
- */
- ret = bch2_remap_range(c,
- inode_inum(dst), pos_dst >> 9,
- inode_inum(src), pos_src >> 9,
- aligned_len >> 9,
- pos_dst + len, &i_sectors_delta,
- false);
- if (ret < 0)
- goto err;
-
- /*
- * due to alignment, we might have remapped slightly more than requested
- */
- ret = min((u64) ret << 9, (u64) len);
-
- bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
-
- spin_lock(&dst->v.i_lock);
- if (pos_dst + ret > dst->v.i_size)
- i_size_write(&dst->v, pos_dst + ret);
- spin_unlock(&dst->v.i_lock);
-
- if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
- IS_SYNC(file_inode(file_dst)))
- ret = bch2_flush_inode(c, dst);
-err:
- bch2_quota_reservation_put(c, dst, &quota_res);
- bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
- unlock_two_nondirectories(&src->v, &dst->v);
-
- return bch2_err_class(ret);
-}
-
-/* fseek: */
-
-static loff_t bch2_seek_data(struct file *file, u64 offset)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- subvol_inum inum = inode_inum(inode);
- u64 isize, next_data = MAX_LFS_FILESIZE;
-
- isize = i_size_read(&inode->v);
- if (offset >= isize)
- return -ENXIO;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9),
- POS(inode->v.i_ino, U64_MAX),
- inum.subvol, 0, k, ({
- if (bkey_extent_is_data(k.k)) {
- next_data = max(offset, bkey_start_offset(k.k) << 9);
- break;
- } else if (k.k->p.offset >> 9 > isize)
- break;
- 0;
- })));
- if (ret)
- return ret;
-
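- /* Data may also live in dirty pagecache, possibly before the first extent found in the btree: */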
- if (next_data > offset)
- next_data = bch2_seek_pagecache_data(&inode->v,
- offset, next_data, 0, false);
-
- if (next_data >= isize)
- return -ENXIO;
-
- return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
-}
-
-static loff_t bch2_seek_hole(struct file *file, u64 offset)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- subvol_inum inum = inode_inum(inode);
- u64 isize, next_hole = MAX_LFS_FILESIZE;
-
- isize = i_size_read(&inode->v);
- if (offset >= isize)
- return -ENXIO;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9),
- POS(inode->v.i_ino, U64_MAX),
- inum.subvol, BTREE_ITER_slots, k, ({
- if (k.k->p.inode != inode->v.i_ino ||
- !bkey_extent_is_data(k.k)) {
- loff_t start_offset = k.k->p.inode == inode->v.i_ino
- ? max(offset, bkey_start_offset(k.k) << 9)
- : offset;
- loff_t end_offset = k.k->p.inode == inode->v.i_ino
- ? MAX_LFS_FILESIZE
- : k.k->p.offset << 9;
-
- /*
- * Found a hole in the btree, now make sure it's
- * a hole in the pagecache. We might have to
- * keep searching if this hole is entirely dirty
- * in the page cache:
- */
- bch2_trans_unlock(trans);
- loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v,
- start_offset, end_offset, 0, false);
- if (pagecache_hole < end_offset) {
- next_hole = pagecache_hole;
- break;
- }
- } else {
- offset = max(offset, bkey_start_offset(k.k) << 9);
- }
- 0;
- })));
- if (ret)
- return ret;
-
- if (next_hole > isize)
- next_hole = isize;
-
- return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
-}
-
-loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
-{
- loff_t ret;
-
- switch (whence) {
- case SEEK_SET:
- case SEEK_CUR:
- case SEEK_END:
- ret = generic_file_llseek(file, offset, whence);
- break;
- case SEEK_DATA:
- ret = bch2_seek_data(file, offset);
- break;
- case SEEK_HOLE:
- ret = bch2_seek_hole(file, offset);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return bch2_err_class(ret);
-}
-
-void bch2_fs_fsio_exit(struct bch_fs *c)
-{
- bioset_exit(&c->nocow_flush_bioset);
-}
-
-int bch2_fs_fsio_init(struct bch_fs *c)
-{
- if (bioset_init(&c->nocow_flush_bioset,
- 1, offsetof(struct nocow_flush, bio), 0))
- return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
-
- return 0;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
deleted file mode 100644
index ca70346e68dc..000000000000
--- a/fs/bcachefs/fs-io.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IO_H
-#define _BCACHEFS_FS_IO_H
-
-#ifndef NO_BCACHEFS_FS
-
-#include "buckets.h"
-#include "fs.h"
-#include "io_write_types.h"
-#include "quota.h"
-
-#include <linux/uio.h>
-
-struct folio_vec {
- struct folio *fv_folio;
- size_t fv_offset;
- size_t fv_len;
-};
-
-static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
-{
-
- struct folio *folio = page_folio(bv.bv_page);
- size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
- bv.bv_offset;
- size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
-
- return (struct folio_vec) {
- .fv_folio = folio,
- .fv_offset = offset,
- .fv_len = len,
- };
-}
-
-static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
- struct bvec_iter iter)
-{
- return biovec_to_foliovec(bio_iter_iovec(bio, iter));
-}
-
-#define __bio_for_each_folio(bvl, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \
- bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
-
-/**
- * bio_for_each_folio - iterate over folios within a bio
- *
- * Like other non-_all versions, this iterates over what bio->bi_iter currently
- * points to. This version is for drivers, where the bio may have previously
- * been split or cloned.
- */
-#define bio_for_each_folio(bvl, bio, iter) \
- __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
-
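-/* A quota reservation, in sectors, not yet consumed by allocated extents: */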
-struct quota_res {
- u64 sectors;
-};
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-static inline void __bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
-{
- BUG_ON(res->sectors > inode->ei_quota_reserved);
-
- bch2_quota_acct(c, inode->ei_qid, Q_SPC,
- -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
- inode->ei_quota_reserved -= res->sectors;
- res->sectors = 0;
-}
-
-static inline void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
-{
- if (res->sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __bch2_quota_reservation_put(c, inode, res);
- mutex_unlock(&inode->ei_quota_lock);
- }
-}
-
-static inline int bch2_quota_reservation_add(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res,
- u64 sectors,
- bool check_enospc)
-{
- int ret;
-
- if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
- return 0;
-
- mutex_lock(&inode->ei_quota_lock);
- ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
- check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
- if (likely(!ret)) {
- inode->ei_quota_reserved += sectors;
- res->sectors += sectors;
- }
- mutex_unlock(&inode->ei_quota_lock);
-
- return ret;
-}
-
-#else
-
-static inline void __bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res) {}
-
-static inline void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res) {}
-
-static inline int bch2_quota_reservation_add(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res,
- unsigned sectors,
- bool check_enospc)
-{
- return 0;
-}
-
-#endif
-
-void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
- struct quota_res *, s64);
-
-static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, s64 sectors)
-{
- if (sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __bch2_i_sectors_acct(c, inode, quota_res, sectors);
- mutex_unlock(&inode->ei_quota_lock);
- }
-}
-
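-/* current->faults_disabled_mapping holds a mapping pointer, with the low bit used as a 'dropped locks' flag: */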
-static inline struct address_space *faults_disabled_mapping(void)
-{
- return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
-}
-
-static inline void set_fdm_dropped_locks(void)
-{
- current->faults_disabled_mapping =
- (void *) (((unsigned long) current->faults_disabled_mapping)|1);
-}
-
-static inline bool fdm_dropped_locks(void)
-{
- return ((unsigned long) current->faults_disabled_mapping) & 1;
-}
-
-void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
- struct bch_inode_info *, struct closure *);
-
-int __must_check bch2_write_inode_size(struct bch_fs *,
- struct bch_inode_info *,
- loff_t, unsigned);
-
-int bch2_fsync(struct file *, loff_t, loff_t, int);
-
-int bchfs_truncate(struct mnt_idmap *,
- struct bch_inode_info *, struct iattr *);
-long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-
-loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
- loff_t, loff_t, unsigned);
-
-loff_t bch2_llseek(struct file *, loff_t, int);
-
-void bch2_fs_fsio_exit(struct bch_fs *);
-int bch2_fs_fsio_init(struct bch_fs *);
-#else
-static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
-static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
-#endif
-
-#endif /* _BCACHEFS_FS_IO_H */
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
deleted file mode 100644
index 4e72e654da96..000000000000
--- a/fs/bcachefs/fs-ioctl.c
+++ /dev/null
@@ -1,442 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "fs.h"
-#include "fs-ioctl.h"
-#include "namei.h"
-#include "quota.h"
-
-#include <linux/compat.h>
-#include <linux/fsnotify.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/security.h>
-#include <linux/writeback.h>
-
-#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
-#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
-#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
-#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
-
-static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_inode_info *dir = p;
-
- return !bch2_reinherit_attrs(bi, &dir->ei_inode);
-}
-
-static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *src,
- const char __user *name)
-{
- struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
- struct bch_inode_info *dst;
- struct inode *vinode = NULL;
- char *kname = NULL;
- struct qstr qstr;
- int ret = 0;
- subvol_inum inum;
-
- kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
- if (!kname)
- return -ENOMEM;
-
- ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
- if (unlikely(ret < 0))
- goto err1;
-
- qstr.len = ret;
- qstr.name = kname;
-
- ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
- if (ret)
- goto err1;
-
- vinode = bch2_vfs_inode_get(c, inum);
- ret = PTR_ERR_OR_ZERO(vinode);
- if (ret)
- goto err1;
-
- dst = to_bch_ei(vinode);
-
- ret = mnt_want_write_file(file);
- if (ret)
- goto err2;
-
- bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
-
- if (inode_attr_changing(src, dst, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, dst,
- src->ei_qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err3;
- }
-
- ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
-err3:
- bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
-
- /* return true if we did work */
- if (ret >= 0)
- ret = !ret;
-
- mnt_drop_write_file(file);
-err2:
- iput(vinode);
-err1:
- kfree(kname);
-
- return ret;
-}
-
-static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg)
-{
- return put_user(inode->v.i_generation, arg);
-}
-
-static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label)
-{
- int ret;
- size_t len;
- char label[BCH_SB_LABEL_SIZE];
-
- BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX);
-
- mutex_lock(&c->sb_lock);
- memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
- mutex_unlock(&c->sb_lock);
-
- len = strnlen(label, BCH_SB_LABEL_SIZE);
- if (len == BCH_SB_LABEL_SIZE) {
- bch_warn(c,
- "label is too long, return the first %zu bytes",
- --len);
- }
-
- ret = copy_to_user(user_label, label, len);
-
- return ret ? -EFAULT : 0;
-}
-
-static int bch2_ioc_setlabel(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *inode,
- const char __user *user_label)
-{
- int ret;
- char label[BCH_SB_LABEL_SIZE];
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(label, user_label, sizeof(label)))
- return -EFAULT;
-
- if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) {
- bch_err(c,
- "unable to set label with more than %d bytes",
- BCH_SB_LABEL_SIZE - 1);
- return -EINVAL;
- }
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- mutex_lock(&c->sb_lock);
- strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
- ret = bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- mnt_drop_write_file(file);
- return ret;
-}
-
-static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
-{
- u32 flags;
- int ret = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (get_user(flags, arg))
- return -EFAULT;
-
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "shutdown by ioctl type %u", flags);
-
- switch (flags) {
- case FSOP_GOING_FLAGS_DEFAULT:
- ret = bdev_freeze(c->vfs_sb->s_bdev);
- if (ret)
- break;
- bch2_journal_flush(&c->journal);
- bch2_fs_emergency_read_only2(c, &buf);
- bdev_thaw(c->vfs_sb->s_bdev);
- break;
- case FSOP_GOING_FLAGS_LOGFLUSH:
- bch2_journal_flush(&c->journal);
- fallthrough;
- case FSOP_GOING_FLAGS_NOLOGFLUSH:
- bch2_fs_emergency_read_only2(c, &buf);
- break;
- default:
- ret = -EINVAL;
- goto noprint;
- }
-
- bch2_print_str(c, KERN_ERR, buf.buf);
-noprint:
- printbuf_exit(&buf);
- return ret;
-}
-
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
-{
- struct inode *dir;
- struct bch_inode_info *inode;
- struct user_namespace *s_user_ns;
- struct dentry *dst_dentry;
- struct path src_path, dst_path;
- int how = LOOKUP_FOLLOW;
- int error;
- subvol_inum snapshot_src = { 0 };
- unsigned lookup_flags = 0;
- unsigned create_flags = BCH_CREATE_SUBVOL;
-
- if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
- BCH_SUBVOL_SNAPSHOT_RO))
- return -EINVAL;
-
- if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
- (arg.src_ptr ||
- (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
- return -EINVAL;
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
- create_flags |= BCH_CREATE_SNAPSHOT;
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
- create_flags |= BCH_CREATE_SNAPSHOT_RO;
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
- /* sync_inodes_sb() requires that s_umount be held */
- down_read(&c->vfs_sb->s_umount);
- sync_inodes_sb(c->vfs_sb);
- up_read(&c->vfs_sb->s_umount);
- }
-
- if (arg.src_ptr) {
- error = user_path_at(arg.dirfd,
- (const char __user *)(unsigned long)arg.src_ptr,
- how, &src_path);
- if (error)
- goto err1;
-
- if (src_path.dentry->d_sb->s_fs_info != c) {
- path_put(&src_path);
- error = -EXDEV;
- goto err1;
- }
-
- snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
- }
-
- dst_dentry = user_path_create(arg.dirfd,
- (const char __user *)(unsigned long)arg.dst_ptr,
- &dst_path, lookup_flags);
- error = PTR_ERR_OR_ZERO(dst_dentry);
- if (error)
- goto err2;
-
- if (dst_dentry->d_sb->s_fs_info != c) {
- error = -EXDEV;
- goto err3;
- }
-
- if (dst_dentry->d_inode) {
- error = bch_err_throw(c, EEXIST_subvolume_create);
- goto err3;
- }
-
- dir = dst_path.dentry->d_inode;
- if (IS_DEADDIR(dir)) {
- error = bch_err_throw(c, ENOENT_directory_dead);
- goto err3;
- }
-
- s_user_ns = dir->i_sb->s_user_ns;
- if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
- !kgid_has_mapping(s_user_ns, current_fsgid())) {
- error = -EOVERFLOW;
- goto err3;
- }
-
- error = inode_permission(file_mnt_idmap(filp),
- dir, MAY_WRITE | MAY_EXEC);
- if (error)
- goto err3;
-
- if (!IS_POSIXACL(dir))
- arg.mode &= ~current_umask();
-
- error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
- if (error)
- goto err3;
-
- if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
- !arg.src_ptr)
- snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
-
- down_write(&c->snapshot_create_lock);
- inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
- dst_dentry, arg.mode|S_IFDIR,
- 0, snapshot_src, create_flags);
- up_write(&c->snapshot_create_lock);
-
- error = PTR_ERR_OR_ZERO(inode);
- if (error)
- goto err3;
-
- d_instantiate(dst_dentry, &inode->v);
- fsnotify_mkdir(dir, dst_dentry);
-err3:
- done_path_create(&dst_path, dst_dentry);
-err2:
- if (arg.src_ptr)
- path_put(&src_path);
-err1:
- return error;
-}
-
-static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
-{
- const char __user *name = (void __user *)(unsigned long)arg.dst_ptr;
- struct path path;
- struct inode *dir;
- struct dentry *victim;
- int ret = 0;
-
- if (arg.flags)
- return -EINVAL;
-
- victim = user_path_locked_at(arg.dirfd, name, &path);
- if (IS_ERR(victim))
- return PTR_ERR(victim);
-
- dir = d_inode(path.dentry);
- if (victim->d_sb->s_fs_info != c) {
- ret = -EXDEV;
- goto err;
- }
-
- ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?:
- __bch2_unlink(dir, victim, true);
- if (!ret) {
- fsnotify_rmdir(dir, victim);
- d_invalidate(victim);
- }
-err:
- inode_unlock(dir);
- dput(victim);
- path_put(&path);
- return ret;
-}
-
-long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- long ret;
-
- switch (cmd) {
- case BCHFS_IOC_REINHERIT_ATTRS:
- ret = bch2_ioc_reinherit_attrs(c, file, inode,
- (void __user *) arg);
- break;
-
- case FS_IOC_GETVERSION:
- ret = bch2_ioc_getversion(inode, (u32 __user *) arg);
- break;
-
- case FS_IOC_SETVERSION:
- ret = -ENOTTY;
- break;
-
- case FS_IOC_GETFSLABEL:
- ret = bch2_ioc_getlabel(c, (void __user *) arg);
- break;
-
- case FS_IOC_SETFSLABEL:
- ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg);
- break;
-
- case FS_IOC_GOINGDOWN:
- ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
- break;
-
- case BCH_IOCTL_SUBVOLUME_CREATE: {
- struct bch_ioctl_subvolume i;
-
- ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
- ? -EFAULT
- : bch2_ioctl_subvolume_create(c, file, i);
- break;
- }
-
- case BCH_IOCTL_SUBVOLUME_DESTROY: {
- struct bch_ioctl_subvolume i;
-
- ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
- ? -EFAULT
- : bch2_ioctl_subvolume_destroy(c, file, i);
- break;
- }
-
- default:
- ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
- break;
- }
-
- return bch2_err_class(ret);
-}
-
-#ifdef CONFIG_COMPAT
-long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
- /* These are just misnamed; they actually get/put an int from/to user */
- switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
- case FS_IOC32_GETVERSION:
- cmd = FS_IOC_GETVERSION;
- break;
- case FS_IOC_GETFSLABEL:
- case FS_IOC_SETFSLABEL:
- break;
- default:
- return -ENOIOCTLCMD;
- }
- return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
deleted file mode 100644
index a657e4994b71..000000000000
--- a/fs/bcachefs/fs-ioctl.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_IOCTL_H
-#define _BCACHEFS_FS_IOCTL_H
-
-long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
-long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
-
-#endif /* _BCACHEFS_FS_IOCTL_H */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
deleted file mode 100644
index 687af0eea0c2..000000000000
--- a/fs/bcachefs/fs.c
+++ /dev/null
@@ -1,2768 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "errcode.h"
-#include "extents.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-ioctl.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fs-io-pagecache.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_read.h"
-#include "journal.h"
-#include "keylist.h"
-#include "namei.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "snapshot.h"
-#include "super.h"
-#include "xattr.h"
-#include "trace.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/exportfs.h>
-#include <linux/fiemap.h>
-#include <linux/fileattr.h>
-#include <linux/fs_context.h>
-#include <linux/module.h>
-#include <linux/pagemap.h>
-#include <linux/posix_acl.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <linux/siphash.h>
-#include <linux/statfs.h>
-#include <linux/string.h>
-#include <linux/xattr.h>
-
-static struct kmem_cache *bch2_inode_cache;
-
-static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
- struct bch_inode_info *,
- struct bch_inode_unpacked *,
- struct bch_subvolume *);
-
-/* Set VFS inode flags from bcachefs inode: */
-static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode)
-{
- static const __maybe_unused unsigned bch_flags_to_vfs[] = {
- [__BCH_INODE_sync] = S_SYNC,
- [__BCH_INODE_immutable] = S_IMMUTABLE,
- [__BCH_INODE_append] = S_APPEND,
- [__BCH_INODE_noatime] = S_NOATIME,
- };
-
- set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
-
- if (bch2_inode_casefold(c, &inode->ei_inode))
- inode->v.i_flags |= S_CASEFOLD;
- else
- inode->v.i_flags &= ~S_CASEFOLD;
-}
-
-void bch2_inode_update_after_write(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- unsigned fields)
-{
- struct bch_fs *c = trans->c;
-
- BUG_ON(bi->bi_inum != inode->v.i_ino);
-
- bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));
-
- set_nlink(&inode->v, bch2_inode_nlink_get(bi));
- i_uid_write(&inode->v, bi->bi_uid);
- i_gid_write(&inode->v, bi->bi_gid);
- inode->v.i_mode = bi->bi_mode;
-
- if (fields & ATTR_SIZE)
- i_size_write(&inode->v, bi->bi_size);
-
- if (fields & ATTR_ATIME)
- inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
- if (fields & ATTR_MTIME)
- inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
- if (fields & ATTR_CTIME)
- inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
-
- inode->ei_inode = *bi;
-
- bch2_inode_flags_to_vfs(c, inode);
-}
-
-int __must_check bch2_write_inode(struct bch_fs *c,
- struct bch_inode_info *inode,
- inode_set_fn set,
- void *p, unsigned fields)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = {};
- struct bch_inode_unpacked inode_u;
- int ret;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent);
- if (ret)
- goto err;
-
- struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u);
-
- ret = (set ? set(trans, inode, &inode_u, p) : 0);
- if (ret)
- goto err;
-
- struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
- bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r));
-
- if (rebalance_changed) {
- ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
- if (ret)
- goto err;
- }
-
- ret = bch2_inode_write(trans, &iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-
- /*
- * the btree node lock protects inode->ei_inode, not ei_update_lock;
- * this is important for inode updates via bchfs_write_index_update
- */
- if (!ret)
- bch2_inode_update_after_write(trans, inode, &inode_u, fields);
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- if (rebalance_changed)
- bch2_rebalance_wakeup(c);
-
- bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
- "%s: inode %llu:%llu not found when updating",
- bch2_err_str(ret),
- inode_inum(inode).subvol,
- inode_inum(inode).inum);
-
- bch2_trans_put(trans);
- return ret < 0 ? ret : 0;
-}
-
-int bch2_fs_quota_transfer(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch_qid new_qid,
- unsigned qtypes,
- enum quota_acct_mode mode)
-{
- unsigned i;
- int ret;
-
- qtypes &= enabled_qtypes(c);
-
- for (i = 0; i < QTYP_NR; i++)
- if (new_qid.q[i] == inode->ei_qid.q[i])
- qtypes &= ~(1U << i);
-
- if (!qtypes)
- return 0;
-
- mutex_lock(&inode->ei_quota_lock);
-
- ret = bch2_quota_transfer(c, qtypes, new_qid,
- inode->ei_qid,
- inode->v.i_blocks +
- inode->ei_quota_reserved,
- mode);
- if (!ret)
- for (i = 0; i < QTYP_NR; i++)
- if (qtypes & (1 << i))
- inode->ei_qid.q[i] = new_qid.q[i];
-
- mutex_unlock(&inode->ei_quota_lock);
-
- return ret;
-}
-
-static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
-{
- const subvol_inum *inum = data;
- siphash_key_t k = { .key[0] = seed };
-
- return siphash_2u64(inum->subvol, inum->inum, &k);
-}
-
-static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
-{
- const struct bch_inode_info *inode = data;
-
- return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
-}
-
-static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct bch_inode_info *inode = obj;
- const subvol_inum *v = arg->key;
-
- return !subvol_inum_eq(inode->ei_inum, *v);
-}
-
-static const struct rhashtable_params bch2_vfs_inodes_params = {
- .head_offset = offsetof(struct bch_inode_info, hash),
- .key_offset = offsetof(struct bch_inode_info, ei_inum),
- .key_len = sizeof(subvol_inum),
- .hashfn = bch2_vfs_inode_hash_fn,
- .obj_hashfn = bch2_vfs_inode_obj_hash_fn,
- .obj_cmpfn = bch2_vfs_inode_cmp_fn,
- .automatic_shrinking = true,
-};
-
-static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = {
- .head_offset = offsetof(struct bch_inode_info, by_inum_hash),
- .key_offset = offsetof(struct bch_inode_info, ei_inum.inum),
- .key_len = sizeof(u64),
- .automatic_shrinking = true,
-};
-
-int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
-{
- struct bch_fs *c = trans->c;
- struct rhltable *ht = &c->vfs_inodes_by_inum_table;
- u64 inum = p.offset;
- DARRAY(u32) subvols;
- int ret = 0;
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return false;
-
- darray_init(&subvols);
-restart_from_top:
-
- /*
- * Tweaked version of __rhashtable_lookup(); we need to get a list of
- * subvolumes in which the given inode number is open.
- *
- * For this to work, we don't include the subvolume ID in the key that
- * we hash - all inodes with the same inode number regardless of
- * subvolume will hash to the same slot.
- *
- * This will be less than ideal if the same file is ever open
- * simultaneously in many different snapshots:
- */
- rcu_read_lock();
- struct rhash_lock_head __rcu *const *bkt;
- struct rhash_head *he;
- unsigned int hash;
- struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht);
-restart:
- hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params);
- bkt = rht_bucket(tbl, hash);
- do {
- struct bch_inode_info *inode;
-
- rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
- if (inode->ei_inum.inum == inum) {
- ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
- GFP_NOWAIT|__GFP_NOWARN);
- if (ret) {
- rcu_read_unlock();
- ret = darray_make_room(&subvols, 1);
- if (ret)
- goto err;
- subvols.nr = 0;
- goto restart_from_top;
- }
- }
- }
- /* An object might have been moved to a different hash chain,
- * while we walk along it - better check and retry.
- */
- } while (he != RHT_NULLS_MARKER(bkt));
-
- /* Ensure we see any new tables. */
- smp_rmb();
-
- tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht);
- if (unlikely(tbl))
- goto restart;
- rcu_read_unlock();
-
- darray_for_each(subvols, i) {
- u32 snap;
- ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
- if (ret)
- goto err;
-
- ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
- if (ret)
- break;
- }
-err:
- darray_exit(&subvols);
- return ret;
-}
-
-static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
-{
- return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
-}
-
-static void __wait_on_freeing_inode(struct bch_fs *c,
- struct bch_inode_info *inode,
- subvol_inum inum)
-{
- wait_queue_head_t *wq;
- struct wait_bit_queue_entry wait;
-
- wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&inode->v.i_lock);
-
- if (__bch2_inode_hash_find(c, inum) == inode)
- schedule_timeout(HZ * 10);
- finish_wait(wq, &wait.wq_entry);
-}
-
-static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
- subvol_inum inum)
-{
- struct bch_inode_info *inode;
-repeat:
- inode = __bch2_inode_hash_find(c, inum);
- if (inode) {
- spin_lock(&inode->v.i_lock);
- if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
- spin_unlock(&inode->v.i_lock);
- return NULL;
- }
- if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
- if (!trans) {
- __wait_on_freeing_inode(c, inode, inum);
- } else {
- int ret = drop_locks_do(trans,
- (__wait_on_freeing_inode(c, inode, inum), 0));
- if (ret)
- return ERR_PTR(ret);
- }
- goto repeat;
- }
- __iget(&inode->v);
- spin_unlock(&inode->v.i_lock);
- }
-
- return inode;
-}
-
-static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
-{
- spin_lock(&inode->v.i_lock);
- bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
- spin_unlock(&inode->v.i_lock);
-
- if (remove) {
- int ret = rhltable_remove(&c->vfs_inodes_by_inum_table,
- &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params);
- BUG_ON(ret);
-
- ret = rhashtable_remove_fast(&c->vfs_inodes_table,
- &inode->hash, bch2_vfs_inodes_params);
- BUG_ON(ret);
- inode->v.i_hash.pprev = NULL;
- /*
- * This pairs with the bch2_inode_hash_find() ->
- * __wait_on_freeing_inode() path
- */
- inode_wake_up_bit(&inode->v, __I_NEW);
- }
-}
-
-static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
- struct btree_trans *trans,
- struct bch_inode_info *inode)
-{
- struct bch_inode_info *old = inode;
-
- set_bit(EI_INODE_HASHED, &inode->ei_flags);
-retry:
- if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
- &inode->ei_inum,
- &inode->hash,
- bch2_vfs_inodes_params))) {
- old = bch2_inode_hash_find(c, trans, inode->ei_inum);
- if (!old)
- goto retry;
-
- clear_bit(EI_INODE_HASHED, &inode->ei_flags);
-
- /*
- * bcachefs doesn't use I_NEW; we have no use for it since we
- * only insert fully created inodes in the inode hash table. But
- * discard_new_inode() expects it to be set...
- */
- inode->v.i_state |= I_NEW;
- /*
- * We don't want bch2_evict_inode() to delete the inode on disk,
- * we just raced and had another inode in cache. Normally new
- * inodes don't have nlink == 0 - except tmpfiles do...
- */
- set_nlink(&inode->v, 1);
- discard_new_inode(&inode->v);
- return old;
- } else {
- int ret = rhltable_insert(&c->vfs_inodes_by_inum_table,
- &inode->by_inum_hash,
- bch2_vfs_inodes_by_inum_params);
- BUG_ON(ret);
-
- inode_fake_hash(&inode->v);
-
- inode_sb_list_add(&inode->v);
-
- mutex_lock(&c->vfs_inodes_lock);
- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
- mutex_unlock(&c->vfs_inodes_lock);
- return inode;
- }
-}
-
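-/* Evaluate _do with the given memalloc (PF_*) flags set on the current task, restoring the saved flags afterwards: */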
-#define memalloc_flags_do(_flags, _do) \
-({ \
- unsigned _saved_flags = memalloc_flags_save(_flags); \
- typeof(_do) _ret = _do; \
- memalloc_noreclaim_restore(_saved_flags); \
- _ret; \
-})
-
-static struct inode *bch2_alloc_inode(struct super_block *sb)
-{
- BUG();
-}
-
-static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
-{
- struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
- bch2_inode_cache, gfp);
- if (!inode)
- return NULL;
-
- inode_init_once(&inode->v);
- mutex_init(&inode->ei_update_lock);
- two_state_lock_init(&inode->ei_pagecache_lock);
- INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
- inode->ei_flags = 0;
- mutex_init(&inode->ei_quota_lock);
- memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
-
- if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
- kmem_cache_free(bch2_inode_cache, inode);
- return NULL;
- }
-
- return inode;
-}
-
-/*
- * Allocate a new inode, dropping/retaking btree locks if necessary:
- */
-static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
-{
- struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
-
- if (unlikely(!inode)) {
- int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
- if (ret && inode) {
- __destroy_inode(&inode->v);
- kmem_cache_free(bch2_inode_cache, inode);
- }
- if (ret)
- return ERR_PTR(ret);
- }
-
- return inode;
-}
-
-static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *bi,
- struct bch_subvolume *subvol)
-{
- struct bch_inode_info *inode = bch2_new_inode(trans);
- if (IS_ERR(inode))
- return inode;
-
- bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
-
- return bch2_inode_hash_insert(trans->c, trans, inode);
-
-}
-
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
-{
- struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
- if (inode)
- return &inode->v;
-
- struct btree_trans *trans = bch2_trans_get(c);
-
- struct bch_inode_unpacked inode_u;
- struct bch_subvolume subvol;
- int ret = lockrestart_do(trans,
- bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
- PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
- bch2_trans_put(trans);
-
- return ret ? ERR_PTR(ret) : &inode->v;
-}
-
-struct bch_inode_info *
-__bch2_create(struct mnt_idmap *idmap,
- struct bch_inode_info *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev, subvol_inum snapshot_src,
- unsigned flags)
-{
- struct bch_fs *c = dir->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct bch_inode_unpacked dir_u;
- struct bch_inode_info *inode;
- struct bch_inode_unpacked inode_u;
- struct posix_acl *default_acl = NULL, *acl = NULL;
- subvol_inum inum;
- struct bch_subvolume subvol;
- u64 journal_seq = 0;
- kuid_t kuid;
- kgid_t kgid;
- int ret;
-
- /*
- * preallocate acls + vfs inode before btree transaction, so that
- * nothing can fail after the transaction succeeds:
- */
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
- if (ret)
- return ERR_PTR(ret);
-#endif
- inode = __bch2_new_inode(c, GFP_NOFS);
- if (unlikely(!inode)) {
- inode = ERR_PTR(-ENOMEM);
- goto err;
- }
-
- bch2_inode_init_early(c, &inode_u);
-
- if (!(flags & BCH_CREATE_TMPFILE))
- mutex_lock(&dir->ei_update_lock);
-
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
- kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
- ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
- bch2_create_trans(trans,
- inode_inum(dir), &dir_u, &inode_u,
- !(flags & BCH_CREATE_TMPFILE)
- ? &dentry->d_name : NULL,
- from_kuid(i_user_ns(&dir->v), kuid),
- from_kgid(i_user_ns(&dir->v), kgid),
- mode, rdev,
- default_acl, acl, snapshot_src, flags) ?:
- bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
- KEY_TYPE_QUOTA_PREALLOC);
- if (unlikely(ret))
- goto err_before_quota;
-
- inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
- inum.inum = inode_u.bi_inum;
-
- ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_trans_commit(trans, NULL, &journal_seq, 0);
- if (unlikely(ret)) {
- bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
- KEY_TYPE_QUOTA_WARN);
-err_before_quota:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- goto err_trans;
- }
-
- if (!(flags & BCH_CREATE_TMPFILE)) {
- bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
- mutex_unlock(&dir->ei_update_lock);
- }
-
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-
- set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
- set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
-
- /*
- * we must insert the new inode into the inode cache before calling
- * bch2_trans_put() and dropping locks, else we could race with another
- * thread pulling the inode in and modifying it:
- *
- * also, calling bch2_inode_hash_insert() without passing in the
- * transaction object is sketchy - if we could ever end up in
- * __wait_on_freeing_inode(), we'd risk deadlock.
- *
- * But that shouldn't be possible, since we still have the inode locked
- * that we just created, and we _really_ can't take a transaction
- * restart here.
- */
- inode = bch2_inode_hash_insert(c, NULL, inode);
- bch2_trans_put(trans);
-err:
- posix_acl_release(default_acl);
- posix_acl_release(acl);
- return inode;
-err_trans:
- if (!(flags & BCH_CREATE_TMPFILE))
- mutex_unlock(&dir->ei_update_lock);
-
- bch2_trans_put(trans);
- make_bad_inode(&inode->v);
- iput(&inode->v);
- inode = ERR_PTR(ret);
- goto err;
-}
-
-/* methods */
-
-static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
- subvol_inum dir, struct bch_hash_info *dir_hash_info,
- const struct qstr *name)
-{
- struct bch_fs *c = trans->c;
- subvol_inum inum = {};
- struct printbuf buf = PRINTBUF;
-
- struct qstr lookup_name;
- int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
- if (ret)
- return ERR_PTR(ret);
-
- struct btree_iter dirent_iter = {};
- struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
- dir_hash_info, dir, &lookup_name, 0);
- ret = bkey_err(k);
- if (ret)
- return ERR_PTR(ret);
-
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- ret = bch2_dirent_read_target(trans, dir, d, &inum);
- if (ret > 0)
- ret = -ENOENT;
- if (ret)
- goto err;
-
- struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
- if (inode)
- goto out;
-
- /*
- * Note: if check/repair needs it, we commit before
- * bch2_inode_hash_init_insert(), as after that point we can't take a
- * restart - not in the top level loop with a commit_do(), like we
- * usually do:
- */
-
- struct bch_subvolume subvol;
- struct bch_inode_unpacked inode_u;
- ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
- bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
-
- /*
- * don't remove it: check_inodes might find another inode that points
- * back to this dirent
- */
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
- c, "dirent to missing inode:\n%s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf));
- if (ret)
- goto err;
-out:
- bch2_trans_iter_exit(trans, &dirent_iter);
- printbuf_exit(&buf);
- return inode;
-err:
- inode = ERR_PTR(ret);
- goto out;
-}
-
-static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
- unsigned int flags)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
-
- struct bch_inode_info *inode;
- bch2_trans_do(c,
- PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
- &hash, &dentry->d_name)));
- if (IS_ERR(inode))
- inode = NULL;
-
- if (!inode && IS_CASEFOLDED(vdir)) {
- /*
- * Do not cache a negative dentry in casefolded directories
- * as it would need to be invalidated in the following situation:
- * - Lookup file "blAH" in a casefolded directory
- * - Creation of file "BLAH" in a casefolded directory
- * - Lookup file "blAH" in a casefolded directory
- * which would fail if we had a negative dentry.
- *
- * We should come back to this when VFS has a method to handle
-		 * this edge case.
- */
- return NULL;
- }
-
- return d_splice_alias(&inode->v, dentry);
-}
-
-static int bch2_mknod(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
-{
- struct bch_inode_info *inode =
- __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
- (subvol_inum) { 0 }, 0);
-
- if (IS_ERR(inode))
- return bch2_err_class(PTR_ERR(inode));
-
- d_instantiate(dentry, &inode->v);
- return 0;
-}
-
-static int bch2_create(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry,
- umode_t mode, bool excl)
-{
- return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
-}
-
-static int __bch2_link(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch_inode_info *dir,
- struct dentry *dentry)
-{
- struct bch_inode_unpacked dir_u, inode_u;
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
- struct btree_trans *trans = bch2_trans_get(c);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_link_trans(trans,
- inode_inum(dir), &dir_u,
- inode_inum(inode), &inode_u,
- &dentry->d_name));
-
- if (likely(!ret)) {
- bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
- bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
- }
-
- bch2_trans_put(trans);
- mutex_unlock(&inode->ei_update_lock);
- return ret;
-}
-
-static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
- struct dentry *dentry)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
- int ret;
-
- lockdep_assert_held(&inode->v.i_rwsem);
-
- ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
- bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- __bch2_link(c, inode, dir, dentry);
- if (unlikely(ret))
- return bch2_err_class(ret);
-
- ihold(&inode->v);
- d_instantiate(dentry, &inode->v);
- return 0;
-}
-
-int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
- bool deleting_snapshot)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct bch_inode_unpacked dir_u, inode_u;
- int ret;
-
- bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
-
- struct btree_trans *trans = bch2_trans_get(c);
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_unlink_trans(trans,
- inode_inum(dir), &dir_u,
- &inode_u, &dentry->d_name,
- deleting_snapshot));
- if (unlikely(ret))
- goto err;
-
- bch2_inode_update_after_write(trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
- bch2_inode_update_after_write(trans, inode, &inode_u,
- ATTR_MTIME);
-
- if (inode_u.bi_subvol) {
- /*
- * Subvolume deletion is asynchronous, but we still want to tell
- * the VFS that it's been deleted here:
- */
- set_nlink(&inode->v, 0);
- }
-
- if (IS_CASEFOLDED(vdir))
- d_invalidate(dentry);
-err:
- bch2_trans_put(trans);
- bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
-
- return ret;
-}
-
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
-{
-	struct bch_inode_info *dir = to_bch_ei(vdir);
- struct bch_fs *c = dir->v.i_sb->s_fs_info;
-
- int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
- __bch2_unlink(vdir, dentry, false);
- return bch2_err_class(ret);
-}
-
-static int bch2_symlink(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry,
- const char *symname)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
- struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
- int ret;
-
- inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
- (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
- if (IS_ERR(inode))
- return bch2_err_class(PTR_ERR(inode));
-
- inode_lock(&inode->v);
- ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
- inode_unlock(&inode->v);
-
- if (unlikely(ret))
- goto err;
-
- ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
- if (unlikely(ret))
- goto err;
-
- ret = __bch2_link(c, inode, dir, dentry);
- if (unlikely(ret))
- goto err;
-
- d_instantiate(dentry, &inode->v);
- return 0;
-err:
- iput(&inode->v);
- return bch2_err_class(ret);
-}
-
-static struct dentry *bch2_mkdir(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry, umode_t mode)
-{
- return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0));
-}
-
-static int bch2_rename2(struct mnt_idmap *idmap,
- struct inode *src_vdir, struct dentry *src_dentry,
- struct inode *dst_vdir, struct dentry *dst_dentry,
- unsigned flags)
-{
- struct bch_fs *c = src_vdir->i_sb->s_fs_info;
- struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
- struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
- struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
- struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
- struct bch_inode_unpacked dst_dir_u, src_dir_u;
- struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
- struct btree_trans *trans;
- enum bch_rename_mode mode = flags & RENAME_EXCHANGE
- ? BCH_RENAME_EXCHANGE
- : dst_dentry->d_inode
- ? BCH_RENAME_OVERWRITE : BCH_RENAME;
- bool whiteout = !!(flags & RENAME_WHITEOUT);
- int ret;
-
- if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
- return -EINVAL;
-
- if (mode == BCH_RENAME_OVERWRITE) {
- ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
- 0, LLONG_MAX);
- if (ret)
- return ret;
- }
-
- bch2_lock_inodes(INODE_UPDATE_LOCK,
- src_dir,
- dst_dir,
- src_inode,
- dst_inode);
-
- trans = bch2_trans_get(c);
-
- ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
- bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
- if (ret)
- goto err_tx_restart;
-
- if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, src_inode,
- dst_dir->ei_qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err;
- }
-
- if (mode == BCH_RENAME_EXCHANGE &&
- inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, dst_inode,
- src_dir->ei_qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err;
- }
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_rename_trans(trans,
- inode_inum(src_dir), &src_dir_u,
- inode_inum(dst_dir), &dst_dir_u,
- &src_inode_u,
- &dst_inode_u,
- &src_dentry->d_name,
- &dst_dentry->d_name,
- mode);
- if (unlikely(ret))
- goto err_tx_restart;
-
- if (whiteout) {
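-		/*
-		 * RENAME_WHITEOUT: also create a whiteout (a 0:0 char device) at
-		 * the source name, in the same transaction as the rename:
-		 */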
- whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
- ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
- if (unlikely(ret))
- goto err_tx_restart;
- bch2_inode_init_early(c, whiteout_inode_u);
-
- ret = bch2_create_trans(trans,
- inode_inum(src_dir), &src_dir_u,
- whiteout_inode_u,
- &src_dentry->d_name,
- from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
- from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
- S_IFCHR|WHITEOUT_MODE, 0,
- NULL, NULL, (subvol_inum) { 0 }, 0) ?:
- bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
- KEY_TYPE_QUOTA_PREALLOC);
- if (unlikely(ret))
- goto err_tx_restart;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0);
- if (unlikely(ret)) {
-err_tx_restart:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- goto err;
- }
-
- BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
- BUG_ON(dst_inode &&
- dst_inode->v.i_ino != dst_inode_u.bi_inum);
-
- bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-
- if (src_dir != dst_dir)
- bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
- ATTR_MTIME|ATTR_CTIME|ATTR_SIZE);
-
- bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
- ATTR_CTIME);
-
- if (dst_inode)
- bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
- ATTR_CTIME);
-err:
- bch2_trans_put(trans);
-
- bch2_fs_quota_transfer(c, src_inode,
- bch_qid(&src_inode->ei_inode),
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_NOCHECK);
- if (dst_inode)
- bch2_fs_quota_transfer(c, dst_inode,
- bch_qid(&dst_inode->ei_inode),
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_NOCHECK);
-
- bch2_unlock_inodes(INODE_UPDATE_LOCK,
- src_dir,
- dst_dir,
- src_inode,
- dst_inode);
-
- return bch2_err_class(ret);
-}
-
-static void bch2_setattr_copy(struct mnt_idmap *idmap,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- struct iattr *attr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- unsigned int ia_valid = attr->ia_valid;
- kuid_t kuid;
- kgid_t kgid;
-
- if (ia_valid & ATTR_UID) {
- kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
- bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
- }
- if (ia_valid & ATTR_GID) {
- kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
- bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
- }
-
- if (ia_valid & ATTR_SIZE)
- bi->bi_size = attr->ia_size;
-
- if (ia_valid & ATTR_ATIME)
- bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
- if (ia_valid & ATTR_MTIME)
- bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
- if (ia_valid & ATTR_CTIME)
- bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
-
- if (ia_valid & ATTR_MODE) {
- umode_t mode = attr->ia_mode;
- kgid_t gid = ia_valid & ATTR_GID
- ? kgid
- : inode->v.i_gid;
-
- if (!in_group_or_capable(idmap, &inode->v,
- make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
- mode &= ~S_ISGID;
- bi->bi_mode = mode;
- }
-}
-
-int bch2_setattr_nonsize(struct mnt_idmap *idmap,
- struct bch_inode_info *inode,
- struct iattr *attr)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_qid qid;
- struct btree_trans *trans;
- struct btree_iter inode_iter = {};
- struct bch_inode_unpacked inode_u;
- struct posix_acl *acl = NULL;
- kuid_t kuid;
- kgid_t kgid;
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
-
- qid = inode->ei_qid;
-
- if (attr->ia_valid & ATTR_UID) {
- kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
- qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
- }
-
- if (attr->ia_valid & ATTR_GID) {
- kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
- qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
- }
-
- ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- goto err;
-
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
- kfree(acl);
- acl = NULL;
-
- ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
- BTREE_ITER_intent);
- if (ret)
- goto btree_err;
-
- bch2_setattr_copy(idmap, inode, &inode_u, attr);
-
- if (attr->ia_valid & ATTR_MODE) {
- ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
- inode_u.bi_mode, &acl);
- if (ret)
- goto btree_err;
- }
-
- ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-btree_err:
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- if (unlikely(ret))
- goto err_trans;
-
- bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
-
- if (acl)
- set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
-err_trans:
- bch2_trans_put(trans);
-err:
- mutex_unlock(&inode->ei_update_lock);
-
- return bch2_err_class(ret);
-}
-
-static int bch2_getattr(struct mnt_idmap *idmap,
- const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned query_flags)
-{
- struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
- vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
-
- stat->dev = inode->v.i_sb->s_dev;
- stat->ino = inode->v.i_ino;
- stat->mode = inode->v.i_mode;
- stat->nlink = inode->v.i_nlink;
- stat->uid = vfsuid_into_kuid(vfsuid);
- stat->gid = vfsgid_into_kgid(vfsgid);
- stat->rdev = inode->v.i_rdev;
- stat->size = i_size_read(&inode->v);
- stat->atime = inode_get_atime(&inode->v);
- stat->mtime = inode_get_mtime(&inode->v);
- stat->ctime = inode_get_ctime(&inode->v);
- stat->blksize = block_bytes(c);
- stat->blocks = inode->v.i_blocks;
-
- stat->subvol = inode->ei_inum.subvol;
- stat->result_mask |= STATX_SUBVOL;
-
- if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
- stat->result_mask |= STATX_DIOALIGN;
- /*
- * this is incorrect; we should be tracking this in superblock,
- * and checking the alignment of open devices
- */
- stat->dio_mem_align = SECTOR_SIZE;
- stat->dio_offset_align = block_bytes(c);
- }
-
- if (request_mask & STATX_BTIME) {
- stat->result_mask |= STATX_BTIME;
- stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
- }
-
- if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
- stat->attributes |= STATX_ATTR_IMMUTABLE;
- stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
-
- if (inode->ei_inode.bi_flags & BCH_INODE_append)
- stat->attributes |= STATX_ATTR_APPEND;
- stat->attributes_mask |= STATX_ATTR_APPEND;
-
- if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
- stat->attributes |= STATX_ATTR_NODUMP;
- stat->attributes_mask |= STATX_ATTR_NODUMP;
-
- return 0;
-}
-
-static int bch2_setattr(struct mnt_idmap *idmap,
- struct dentry *dentry, struct iattr *iattr)
-{
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret;
-
- lockdep_assert_held(&inode->v.i_rwsem);
-
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- setattr_prepare(idmap, dentry, iattr);
- if (ret)
- return ret;
-
- return iattr->ia_valid & ATTR_SIZE
- ? bchfs_truncate(idmap, inode, iattr)
- : bch2_setattr_nonsize(idmap, inode, iattr);
-}
-
-static int bch2_tmpfile(struct mnt_idmap *idmap,
- struct inode *vdir, struct file *file, umode_t mode)
-{
- struct bch_inode_info *inode =
- __bch2_create(idmap, to_bch_ei(vdir),
- file->f_path.dentry, mode, 0,
- (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
-
- if (IS_ERR(inode))
- return bch2_err_class(PTR_ERR(inode));
-
- d_mark_tmpfile(file, &inode->v);
- d_instantiate(file->f_path.dentry, &inode->v);
- return finish_open_simple(file, 0);
-}
-
-struct bch_fiemap_extent {
- struct bkey_buf kbuf;
- unsigned flags;
-};
-
-static int bch2_fill_extent(struct bch_fs *c,
- struct fiemap_extent_info *info,
- struct bch_fiemap_extent *fe)
-{
- struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
- unsigned flags = fe->flags;
-
- BUG_ON(!k.k->size);
-
- if (bkey_extent_is_direct_data(k.k)) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- int ret;
-
- if (k.k->type == KEY_TYPE_reflink_v)
- flags |= FIEMAP_EXTENT_SHARED;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int flags2 = 0;
- u64 offset = p.ptr.offset;
-
- if (p.ptr.unwritten)
- flags2 |= FIEMAP_EXTENT_UNWRITTEN;
-
- if (p.crc.compression_type)
- flags2 |= FIEMAP_EXTENT_ENCODED;
- else
- offset += p.crc.offset;
-
- if ((offset & (block_sectors(c) - 1)) ||
- (k.k->size & (block_sectors(c) - 1)))
- flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
-
- ret = fiemap_fill_next_extent(info,
- bkey_start_offset(k.k) << 9,
- offset << 9,
- k.k->size << 9, flags|flags2);
- if (ret)
- return ret;
- }
-
- return 0;
- } else if (bkey_extent_is_inline_data(k.k)) {
- return fiemap_fill_next_extent(info,
- bkey_start_offset(k.k) << 9,
- 0, k.k->size << 9,
- flags|
- FIEMAP_EXTENT_DATA_INLINE);
- } else if (k.k->type == KEY_TYPE_reservation) {
- return fiemap_fill_next_extent(info,
- bkey_start_offset(k.k) << 9,
- 0, k.k->size << 9,
- flags|
- FIEMAP_EXTENT_DELALLOC|
- FIEMAP_EXTENT_UNWRITTEN);
- } else {
- BUG();
- }
-}
-
-/*
- * Scan a range of an inode for data in pagecache.
- *
- * Intended to be retryable, so don't modify the output params until success is
- * imminent.
- */
-static int
-bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
- bool nonblock)
-{
- loff_t dstart, dend;
-
- dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
- if (dstart < 0)
- return dstart;
-
- if (dstart == *end) {
- *start = dstart;
- return 0;
- }
-
- dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
- if (dend < 0)
- return dend;
-
- /* race */
- BUG_ON(dstart == dend);
-
- *start = dstart;
- *end = dend;
- return 0;
-}
-
-/*
- * Scan a range of pagecache that corresponds to a file mapping hole in the
- * extent btree. If data is found, fake up an extent key so it looks like a
- * delalloc extent to the rest of the fiemap processing code.
- */
-static int
-bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
- u64 start, u64 end, struct bch_fiemap_extent *cur)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_extent *delextent;
- struct bch_extent_ptr ptr = {};
- loff_t dstart = start << 9, dend = end << 9;
- int ret;
-
- /*
- * We hold btree locks here so we cannot block on folio locks without
- * dropping trans locks first. Run a nonblocking scan for the common
- * case of no folios over holes and fall back on failure.
- *
- * Note that dropping locks like this is technically racy against
- * writeback inserting to the extent tree, but a non-sync fiemap scan is
-	 * fundamentally racy with writeback anyway. Therefore, just report the
- * range as delalloc regardless of whether we have to cycle trans locks.
- */
- ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
- if (ret == -EAGAIN)
- ret = drop_locks_do(trans,
- bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
- if (ret < 0)
- return ret;
-
- /*
- * Create a fake extent key in the buffer. We have to add a dummy extent
- * pointer for the fill code to add an extent entry. It's explicitly
- * zeroed to reflect delayed allocation (i.e. phys offset 0).
- */
- bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
- delextent = bkey_extent_init(cur->kbuf.k);
- delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
- delextent->k.size = (dend - dstart) >> 9;
- bch2_bkey_append_ptr(&delextent->k_i, ptr);
-
- cur->flags = FIEMAP_EXTENT_DELALLOC;
-
- return 0;
-}
-
-static int bch2_next_fiemap_extent(struct btree_trans *trans,
- struct bch_inode_info *inode,
- u64 start, u64 end,
- struct bch_fiemap_extent *cur)
-{
- u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
- if (ret)
- return ret;
-
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inode->ei_inum.inum, start, snapshot), 0);
-
- struct bkey_s_c k =
- bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end;
-
- ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur);
- if (ret)
- goto err;
-
- struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
-
- /*
- * Does the pagecache or the btree take precedence?
- *
- * It _should_ be the pagecache, so that we correctly report delalloc
- * extents when dirty in the pagecache (we're COW, after all).
- *
- * But we'd have to add per-sector writeback tracking to
- * bch_folio_state, otherwise we report delalloc extents for clean
- * cached data in the pagecache.
- *
- * We should do this, but even then fiemap won't report stable mappings:
- * on bcachefs data moves around in the background (copygc, rebalance)
- * and we don't provide a way for userspace to lock that out.
- */
- if (k.k &&
- bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
- pagecache_start)) {
- bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
- bch2_cut_front(iter.pos, cur->kbuf.k);
- bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
- cur->flags = 0;
- } else if (k.k) {
- bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
- }
-
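-	/*
-	 * Indirect (reflinked) extent: resolve it, then shift the resulting key
-	 * back into this file's keyspace so fiemap reports file offsets:
-	 */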
- if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
- unsigned sectors = cur->kbuf.k->k.size;
- s64 offset_into_extent = 0;
- enum btree_id data_btree = BTREE_ID_extents;
- ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
- &cur->kbuf);
- if (ret)
- goto err;
-
- struct bkey_i *k = cur->kbuf.k;
- sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
-
- bch2_cut_front(POS(k->k.p.inode,
- bkey_start_offset(&k->k) + offset_into_extent),
- k);
- bch2_key_resize(&k->k, sectors);
- k->k.p = iter.pos;
- k->k.p.offset += k->k.size;
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
- u64 start, u64 len)
-{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *ei = to_bch_ei(vinode);
- struct btree_trans *trans;
- struct bch_fiemap_extent cur, prev;
- int ret = 0;
-
- ret = fiemap_prep(&ei->v, info, start, &len, 0);
- if (ret)
- return ret;
-
- if (start + len < start)
- return -EINVAL;
-
- start >>= 9;
- u64 end = (start + len) >> 9;
-
- bch2_bkey_buf_init(&cur.kbuf);
- bch2_bkey_buf_init(&prev.kbuf);
- bkey_init(&prev.kbuf.k->k);
-
- trans = bch2_trans_get(c);
-
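-	/*
-	 * Extents are emitted one iteration behind, via @prev, so that the
-	 * final extent can be flagged with FIEMAP_EXTENT_LAST once we know
-	 * nothing follows it:
-	 */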
- while (start < end) {
- ret = lockrestart_do(trans,
- bch2_next_fiemap_extent(trans, ei, start, end, &cur));
- if (ret)
- goto err;
-
- BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
- BUG_ON(cur.kbuf.k->k.p.offset > end);
-
- if (bkey_start_offset(&cur.kbuf.k->k) == end)
- break;
-
- start = cur.kbuf.k->k.p.offset;
-
- if (!bkey_deleted(&prev.kbuf.k->k)) {
- bch2_trans_unlock(trans);
- ret = bch2_fill_extent(c, info, &prev);
- if (ret)
- goto err;
- }
-
- bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
- prev.flags = cur.flags;
- }
-
- if (!bkey_deleted(&prev.kbuf.k->k)) {
- bch2_trans_unlock(trans);
- prev.flags |= FIEMAP_EXTENT_LAST;
- ret = bch2_fill_extent(c, info, &prev);
- }
-err:
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&cur.kbuf, c);
- bch2_bkey_buf_exit(&prev.kbuf, c);
-
- return bch2_err_class(ret < 0 ? ret : 0);
-}
-
-static const struct vm_operations_struct bch_vm_ops = {
- .fault = bch2_page_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = bch2_page_mkwrite,
-};
-
-static int bch2_mmap_prepare(struct vm_area_desc *desc)
-{
- file_accessed(desc->file);
-
- desc->vm_ops = &bch_vm_ops;
- return 0;
-}
-
-/* Directories: */
-
-static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
-{
- return generic_file_llseek_size(file, offset, whence,
- S64_MAX, S64_MAX);
-}
-
-static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
-
- if (!dir_emit_dots(file, ctx))
- return 0;
-
- int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx);
-
- bch_err_fn(c, ret);
- return bch2_err_class(ret);
-}
-
-static int bch2_open(struct inode *vinode, struct file *file)
-{
- if (file->f_flags & (O_WRONLY|O_RDWR)) {
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
- if (ret)
- return ret;
- }
-
- file->f_mode |= FMODE_CAN_ODIRECT;
-
- return generic_file_open(vinode, file);
-}
-
-/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const __maybe_unused unsigned bch_flags_to_uflags[] = {
- [__BCH_INODE_sync] = FS_SYNC_FL,
- [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
- [__BCH_INODE_append] = FS_APPEND_FL,
- [__BCH_INODE_nodump] = FS_NODUMP_FL,
- [__BCH_INODE_noatime] = FS_NOATIME_FL,
-};
-
-/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const __maybe_unused unsigned bch_flags_to_xflags[] = {
- [__BCH_INODE_sync] = FS_XFLAG_SYNC,
- [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
- [__BCH_INODE_append] = FS_XFLAG_APPEND,
- [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
- [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
-};
-
-static int bch2_fileattr_get(struct dentry *dentry,
- struct file_kattr *fa)
-{
- struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags));
-
- if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
- fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
-
- if (bch2_inode_casefold(c, &inode->ei_inode))
- fa->flags |= FS_CASEFOLD_FL;
-
- fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ];
- return 0;
-}
-
-struct flags_set {
- unsigned mask;
- unsigned flags;
- unsigned projid;
- bool set_project;
- bool set_casefold;
- bool casefold;
-};
-
-static int fssetxattr_inode_update_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = trans->c;
- struct flags_set *s = p;
-
- /*
- * We're relying on btree locking here for exclusion with other ioctl
- * calls - use the flags in the btree (@bi), not inode->i_flags:
- */
- if (!S_ISREG(bi->bi_mode) &&
- !S_ISDIR(bi->bi_mode) &&
- (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags)
- return -EINVAL;
-
- if (s->casefold != bch2_inode_casefold(c, bi)) {
- int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold);
- if (ret)
- return ret;
- }
-
- if (s->set_project) {
- bi->bi_project = s->projid;
- bi->bi_fields_set |= BIT(Inode_opt_project);
- }
-
- bi->bi_flags &= ~s->mask;
- bi->bi_flags |= s->flags;
-
- bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
- return 0;
-}
-
-static int bch2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry,
- struct file_kattr *fa)
-{
- struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct flags_set s = {};
- int ret;
-
- if (fa->fsx_valid) {
- fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
-
- s.mask = map_defined(bch_flags_to_xflags);
- s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
- if (fa->fsx_xflags)
- return -EOPNOTSUPP;
-
- if (fa->fsx_projid >= U32_MAX)
- return -EINVAL;
-
- /*
- * inode fields accessible via the xattr interface are stored with a +1
- * bias, so that 0 means unset:
- */
- if ((inode->ei_inode.bi_project ||
- fa->fsx_projid) &&
- inode->ei_inode.bi_project != fa->fsx_projid + 1) {
- s.projid = fa->fsx_projid + 1;
- s.set_project = true;
- }
- }
-
- if (fa->flags_valid) {
- s.mask = map_defined(bch_flags_to_uflags);
-
- s.set_casefold = true;
- s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0;
- fa->flags &= ~FS_CASEFOLD_FL;
-
- s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
- if (fa->flags)
- return -EOPNOTSUPP;
- }
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- (s.set_project
- ? bch2_set_projid(c, inode, fa->fsx_projid)
- : 0) ?:
- bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
- ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-
- return bch2_err_class(ret);
-}
-
-static const struct file_operations bch_file_operations = {
- .open = bch2_open,
- .llseek = bch2_llseek,
- .read_iter = bch2_read_iter,
- .write_iter = bch2_write_iter,
- .mmap_prepare = bch2_mmap_prepare,
- .get_unmapped_area = thp_get_unmapped_area,
- .fsync = bch2_fsync,
- .splice_read = filemap_splice_read,
- .splice_write = iter_file_splice_write,
- .fallocate = bch2_fallocate_dispatch,
- .unlocked_ioctl = bch2_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = bch2_compat_fs_ioctl,
-#endif
- .remap_file_range = bch2_remap_file_range,
-};
-
-static const struct inode_operations bch_file_inode_operations = {
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .fiemap = bch2_fiemap,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
- .fileattr_get = bch2_fileattr_get,
- .fileattr_set = bch2_fileattr_set,
-};
-
-static const struct inode_operations bch_dir_inode_operations = {
- .lookup = bch2_lookup,
- .create = bch2_create,
- .link = bch2_link,
- .unlink = bch2_unlink,
- .symlink = bch2_symlink,
- .mkdir = bch2_mkdir,
- .rmdir = bch2_unlink,
- .mknod = bch2_mknod,
- .rename = bch2_rename2,
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .tmpfile = bch2_tmpfile,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
- .fileattr_get = bch2_fileattr_get,
- .fileattr_set = bch2_fileattr_set,
-};
-
-static const struct file_operations bch_dir_file_operations = {
- .llseek = bch2_dir_llseek,
- .read = generic_read_dir,
- .iterate_shared = bch2_vfs_readdir,
- .fsync = bch2_fsync,
- .unlocked_ioctl = bch2_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = bch2_compat_fs_ioctl,
-#endif
-};
-
-static const struct inode_operations bch_symlink_inode_operations = {
- .get_link = page_get_link,
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
- .fileattr_get = bch2_fileattr_get,
- .fileattr_set = bch2_fileattr_set,
-};
-
-static const struct inode_operations bch_special_inode_operations = {
- .getattr = bch2_getattr,
- .setattr = bch2_setattr,
- .listxattr = bch2_xattr_list,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_inode_acl = bch2_get_acl,
- .set_acl = bch2_set_acl,
-#endif
- .fileattr_get = bch2_fileattr_get,
- .fileattr_set = bch2_fileattr_set,
-};
-
-static const struct address_space_operations bch_address_space_operations = {
- .read_folio = bch2_read_folio,
- .writepages = bch2_writepages,
- .readahead = bch2_readahead,
- .dirty_folio = filemap_dirty_folio,
- .write_begin = bch2_write_begin,
- .write_end = bch2_write_end,
- .invalidate_folio = bch2_invalidate_folio,
- .release_folio = bch2_release_folio,
-#ifdef CONFIG_MIGRATION
- .migrate_folio = filemap_migrate_folio,
-#endif
- .error_remove_folio = generic_error_remove_folio,
-};
-
-struct bcachefs_fid {
- u64 inum;
- u32 subvol;
- u32 gen;
-} __packed;
-
-struct bcachefs_fid_with_parent {
- struct bcachefs_fid fid;
- struct bcachefs_fid dir;
-} __packed;
-
-static int bcachefs_fid_valid(int fh_len, int fh_type)
-{
- switch (fh_type) {
- case FILEID_BCACHEFS_WITHOUT_PARENT:
- return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
- case FILEID_BCACHEFS_WITH_PARENT:
- return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
- default:
- return false;
- }
-}
-
-static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
-{
- return (struct bcachefs_fid) {
- .inum = inode->ei_inum.inum,
- .subvol = inode->ei_inum.subvol,
- .gen = inode->ei_inode.bi_generation,
- };
-}
-
-static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
- struct inode *vdir)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_inode_info *dir = to_bch_ei(vdir);
- int min_len;
-
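-	/* NFS file handle lengths are in units of u32s: */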
- if (!S_ISDIR(inode->v.i_mode) && dir) {
- struct bcachefs_fid_with_parent *fid = (void *) fh;
-
- min_len = sizeof(*fid) / sizeof(u32);
- if (*len < min_len) {
- *len = min_len;
- return FILEID_INVALID;
- }
-
- fid->fid = bch2_inode_to_fid(inode);
- fid->dir = bch2_inode_to_fid(dir);
-
- *len = min_len;
- return FILEID_BCACHEFS_WITH_PARENT;
- } else {
- struct bcachefs_fid *fid = (void *) fh;
-
- min_len = sizeof(*fid) / sizeof(u32);
- if (*len < min_len) {
- *len = min_len;
- return FILEID_INVALID;
- }
- *fid = bch2_inode_to_fid(inode);
-
- *len = min_len;
- return FILEID_BCACHEFS_WITHOUT_PARENT;
- }
-}
-
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
- struct bcachefs_fid fid)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
- .subvol = fid.subvol,
- .inum = fid.inum,
- });
- if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
- iput(vinode);
- vinode = ERR_PTR(-ESTALE);
- }
- return vinode;
-}
-
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
- int fh_len, int fh_type)
-{
- struct bcachefs_fid *fid = (void *) _fid;
-
- if (!bcachefs_fid_valid(fh_len, fh_type))
- return NULL;
-
- return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
-}
-
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
- int fh_len, int fh_type)
-{
- struct bcachefs_fid_with_parent *fid = (void *) _fid;
-
- if (!bcachefs_fid_valid(fh_len, fh_type) ||
- fh_type != FILEID_BCACHEFS_WITH_PARENT)
- return NULL;
-
- return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
-}
-
-static struct dentry *bch2_get_parent(struct dentry *child)
-{
- struct bch_inode_info *inode = to_bch_ei(child->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- subvol_inum parent_inum = {
- .subvol = inode->ei_inode.bi_parent_subvol ?:
- inode->ei_inum.subvol,
- .inum = inode->ei_inode.bi_dir,
- };
-
- return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
-}
-
-static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
-{
- struct bch_inode_info *inode = to_bch_ei(child->d_inode);
- struct bch_inode_info *dir = to_bch_ei(parent->d_inode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter1;
- struct btree_iter iter2;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- struct bch_inode_unpacked inode_u;
- subvol_inum target;
- u32 snapshot;
- struct qstr dirent_name;
- unsigned name_len = 0;
- int ret;
-
- if (!S_ISDIR(dir->v.i_mode))
- return -EINVAL;
-
- trans = bch2_trans_get(c);
-
- bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
- POS(dir->ei_inode.bi_inum, 0), 0);
- bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
- POS(dir->ei_inode.bi_inum, 0), 0);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &iter1, snapshot);
- bch2_btree_iter_set_snapshot(trans, &iter2, snapshot);
-
- ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
- if (ret)
- goto err;
-
- if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
- bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
-
- k = bch2_btree_iter_peek_slot(trans, &iter1);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_dirent) {
- ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
- goto err;
- }
-
- d = bkey_s_c_to_dirent(k);
- ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
- if (ret > 0)
- ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
- if (ret)
- goto err;
-
- if (subvol_inum_eq(target, inode->ei_inum))
- goto found;
- } else {
- /*
- * File with multiple hardlinks and our backref is to the wrong
- * directory - linear search:
- */
- for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) {
- if (k.k->p.inode > dir->ei_inode.bi_inum)
- break;
-
- if (k.k->type != KEY_TYPE_dirent)
- continue;
-
- d = bkey_s_c_to_dirent(k);
- ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
- if (ret < 0)
- break;
- if (ret)
- continue;
-
- if (subvol_inum_eq(target, inode->ei_inum))
- goto found;
- }
- }
-
- ret = -ENOENT;
- goto err;
-found:
- dirent_name = bch2_dirent_get_name(d);
-
- name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
- memcpy(name, dirent_name.name, name_len);
- name[name_len] = '\0';
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_iter_exit(trans, &iter1);
- bch2_trans_iter_exit(trans, &iter2);
- bch2_trans_put(trans);
-
- return ret;
-}
-
-static const struct export_operations bch_export_ops = {
- .encode_fh = bch2_encode_fh,
- .fh_to_dentry = bch2_fh_to_dentry,
- .fh_to_parent = bch2_fh_to_parent,
- .get_parent = bch2_get_parent,
- .get_name = bch2_get_name,
-};
-
-static void bch2_vfs_inode_init(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- struct bch_subvolume *subvol)
-{
- inode->v.i_ino = inum.inum;
- inode->ei_inum = inum;
- inode->ei_inode.bi_inum = inum.inum;
- bch2_inode_update_after_write(trans, inode, bi, ~0);
-
- inode->v.i_blocks = bi->bi_sectors;
- inode->v.i_rdev = bi->bi_dev;
- inode->v.i_generation = bi->bi_generation;
- inode->v.i_size = bi->bi_size;
-
- inode->ei_flags = 0;
- inode->ei_quota_reserved = 0;
- inode->ei_qid = bch_qid(bi);
-
- if (BCH_SUBVOLUME_SNAP(subvol))
- set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-
- inode->v.i_mapping->a_ops = &bch_address_space_operations;
-
- switch (inode->v.i_mode & S_IFMT) {
- case S_IFREG:
- inode->v.i_op = &bch_file_inode_operations;
- inode->v.i_fop = &bch_file_operations;
- break;
- case S_IFDIR:
- inode->v.i_op = &bch_dir_inode_operations;
- inode->v.i_fop = &bch_dir_file_operations;
- break;
- case S_IFLNK:
- inode_nohighmem(&inode->v);
- inode->v.i_op = &bch_symlink_inode_operations;
- break;
- default:
- init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
- inode->v.i_op = &bch_special_inode_operations;
- break;
- }
-
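-	/*
-	 * With block sizes larger than PAGE_SIZE, pagecache folios must cover
-	 * at least a whole block:
-	 */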
- mapping_set_folio_min_order(inode->v.i_mapping,
- get_order(trans->c->opts.block_size));
-}
-
-static void bch2_free_inode(struct inode *vinode)
-{
- kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
-}
-
-static int inode_update_times_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
- bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
- bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
-
- return 0;
-}
-
-static int bch2_vfs_write_inode(struct inode *vinode,
- struct writeback_control *wbc)
-{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *inode = to_bch_ei(vinode);
- int ret;
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
- ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-
- return bch2_err_class(ret);
-}
-
-static void bch2_evict_inode(struct inode *vinode)
-{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *inode = to_bch_ei(vinode);
- bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
-
- /*
- * evict() has waited for outstanding writeback, we'll do no more IO
- * through this inode: it's safe to remove from VFS inode hashtable here
- *
- * Do that now so that other threads aren't blocked from pulling it back
-	 * in; there's no reason for them to be:
- */
- if (!delete)
- bch2_inode_hash_remove(c, inode);
-
- truncate_inode_pages_final(&inode->v.i_data);
-
- clear_inode(&inode->v);
-
- BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
-
- if (delete) {
- bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
- KEY_TYPE_QUOTA_WARN);
- bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
- KEY_TYPE_QUOTA_WARN);
- int ret = bch2_inode_rm(c, inode_inum(inode));
- if (ret && !bch2_err_matches(ret, EROFS)) {
- bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu",
- inode->ei_inum.subvol,
- inode->ei_inum.inum);
- bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm);
- }
-
- /*
- * If we are deleting, we need it present in the vfs hash table
- * so that fsck can check if unlinked inodes are still open:
- */
- bch2_inode_hash_remove(c, inode);
- }
-
- mutex_lock(&c->vfs_inodes_lock);
- list_del_init(&inode->ei_vfs_inode_list);
- mutex_unlock(&c->vfs_inodes_lock);
-}
-
-void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
-{
- struct bch_inode_info *inode;
- DARRAY(struct bch_inode_info *) grabbed;
- bool clean_pass = false, this_pass_clean;
-
- /*
- * Initially, we scan for inodes without I_DONTCACHE, then mark them to
- * be pruned with d_mark_dontcache().
- *
- * Once we've had a clean pass where we didn't find any inodes without
- * I_DONTCACHE, we wait for them to be freed:
- */
-
- darray_init(&grabbed);
- darray_make_room(&grabbed, 1024);
-again:
- cond_resched();
- this_pass_clean = true;
-
- mutex_lock(&c->vfs_inodes_lock);
- list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
- if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
- continue;
-
- if (!(inode->v.i_state & I_DONTCACHE) &&
- !(inode->v.i_state & I_FREEING) &&
- igrab(&inode->v)) {
- this_pass_clean = false;
-
- if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
- iput(&inode->v);
- break;
- }
- } else if (clean_pass && this_pass_clean) {
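-			/*
-			 * Couldn't grab a ref: either we marked this inode
-			 * I_DONTCACHE on a previous pass or it's already being
-			 * freed. Once a pass finds nothing new to prune, wait
-			 * for it to be evicted, then rescan:
-			 */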
- struct wait_bit_queue_entry wqe;
- struct wait_queue_head *wq_head;
-
- wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
- prepare_to_wait_event(wq_head, &wqe.wq_entry,
- TASK_UNINTERRUPTIBLE);
- mutex_unlock(&c->vfs_inodes_lock);
-
- schedule();
- finish_wait(wq_head, &wqe.wq_entry);
- goto again;
- }
- }
- mutex_unlock(&c->vfs_inodes_lock);
-
- darray_for_each(grabbed, i) {
- inode = *i;
- d_mark_dontcache(&inode->v);
- d_prune_aliases(&inode->v);
- iput(&inode->v);
- }
- grabbed.nr = 0;
-
- if (!clean_pass || !this_pass_clean) {
- clean_pass = this_pass_clean;
- goto again;
- }
-
- darray_exit(&grabbed);
-}
-
-static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct super_block *sb = dentry->d_sb;
- struct bch_fs *c = sb->s_fs_info;
- struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
- unsigned shift = sb->s_blocksize_bits - 9;
- /*
- * this assumes inodes take up 64 bytes, which is a decent average
- * number:
- */
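-	/* sizes are in 512 byte sectors: free sectors << 3 == free bytes / 64 */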
- u64 avail_inodes = ((usage.capacity - usage.used) << 3);
-
- buf->f_type = BCACHEFS_STATFS_MAGIC;
- buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = usage.capacity >> shift;
- buf->f_bfree = usage.free >> shift;
- buf->f_bavail = avail_factor(usage.free) >> shift;
-
- buf->f_files = usage.nr_inodes + avail_inodes;
- buf->f_ffree = avail_inodes;
-
- buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b);
- buf->f_namelen = BCH_NAME_MAX;
-
- return 0;
-}
-
-static int bch2_sync_fs(struct super_block *sb, int wait)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret;
-
- trace_bch2_sync_fs(sb, wait);
-
- if (c->opts.journal_flush_disabled)
- return 0;
-
- if (!wait) {
- bch2_journal_flush_async(&c->journal, NULL);
- return 0;
- }
-
- ret = bch2_journal_flush(&c->journal);
- return bch2_err_class(ret);
-}
-
-static struct bch_fs *bch2_path_to_fs(const char *path)
-{
- struct bch_fs *c;
- dev_t dev;
- int ret;
-
- ret = lookup_bdev(path, &dev);
- if (ret)
- return ERR_PTR(ret);
-
- c = bch2_dev_to_fs(dev);
- if (c)
- closure_put(&c->cl);
- return c ?: ERR_PTR(-ENOENT);
-}
-
-static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
-{
- struct bch_fs *c = root->d_sb->s_fs_info;
- bool first = true;
-
- guard(rcu)();
- for_each_online_member_rcu(c, ca) {
- if (!first)
- seq_putc(seq, ':');
- first = false;
- seq_puts(seq, ca->disk_sb.sb_name);
- }
-
- return 0;
-}
-
-static int bch2_show_options(struct seq_file *seq, struct dentry *root)
-{
- struct bch_fs *c = root->d_sb->s_fs_info;
- struct printbuf buf = PRINTBUF;
-
- bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
- OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
- printbuf_nul_terminate(&buf);
- seq_printf(seq, ",%s", buf.buf);
-
- int ret = buf.allocation_failure ? -ENOMEM : 0;
- printbuf_exit(&buf);
- return ret;
-}
-
-static void bch2_put_super(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- __bch2_fs_stop(c);
-}
-
-/*
- * bcachefs doesn't currently integrate intwrite freeze protection but the
- * internal write references serve the same purpose. Therefore reuse the
- * read-only transition code to perform the quiesce. The caveat is that we don't
- * currently have the ability to block tasks that want a write reference while
- * the superblock is frozen. This is fine for now, but we should either add
- * blocking support or find a way to integrate sb_start_intwrite() and friends.
- */
-static int bch2_freeze(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- down_write(&c->state_lock);
- bch2_fs_read_only(c);
- up_write(&c->state_lock);
- return 0;
-}
-
-static int bch2_unfreeze(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret;
-
- if (test_bit(BCH_FS_emergency_ro, &c->flags))
- return 0;
-
- down_write(&c->state_lock);
- ret = bch2_fs_read_write(c);
- up_write(&c->state_lock);
- return ret;
-}
-
-static const struct super_operations bch_super_operations = {
- .alloc_inode = bch2_alloc_inode,
- .free_inode = bch2_free_inode,
- .write_inode = bch2_vfs_write_inode,
- .evict_inode = bch2_evict_inode,
- .sync_fs = bch2_sync_fs,
- .statfs = bch2_statfs,
- .show_devname = bch2_show_devname,
- .show_options = bch2_show_options,
- .put_super = bch2_put_super,
- .freeze_fs = bch2_freeze,
- .unfreeze_fs = bch2_unfreeze,
-};
-
-static int bch2_set_super(struct super_block *s, void *data)
-{
- s->s_fs_info = data;
- return 0;
-}
-
-static int bch2_noset_super(struct super_block *s, void *data)
-{
- return -EBUSY;
-}
-
-typedef DARRAY(struct bch_fs *) darray_fs;
-
-static int bch2_test_super(struct super_block *s, void *data)
-{
- struct bch_fs *c = s->s_fs_info;
- darray_fs *d = data;
-
- if (!c)
- return false;
-
- darray_for_each(*d, i)
- if (c != *i)
- return false;
- return true;
-}
-
-static int bch2_fs_get_tree(struct fs_context *fc)
-{
- struct bch_fs *c;
- struct super_block *sb;
- struct inode *vinode;
- struct bch2_opts_parse *opts_parse = fc->fs_private;
- struct bch_opts opts = opts_parse->opts;
- darray_const_str devs;
- darray_fs devs_to_fs = {};
- int ret;
-
- opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
- opt_set(opts, nostart, true);
-
- if (!fc->source || strlen(fc->source) == 0)
- return -EINVAL;
-
- ret = bch2_split_devs(fc->source, &devs);
- if (ret)
- return ret;
-
- darray_for_each(devs, i) {
- ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
- if (ret)
- goto err;
- }
-
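-	/*
-	 * Look for an existing superblock for these devices; bch2_noset_super
-	 * means sget() won't create a new one here:
-	 */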
- sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
- if (!IS_ERR(sb))
- goto got_sb;
-
- c = bch2_fs_open(&devs, &opts);
- ret = PTR_ERR_OR_ZERO(c);
- if (ret)
- goto err;
-
- if (opt_defined(opts, discard))
- set_bit(BCH_FS_discard_mount_opt_set, &c->flags);
-
- /* Some options can't be parsed until after the fs is started: */
- opts = bch2_opts_empty();
- ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false);
- if (ret)
- goto err_stop_fs;
-
- bch2_opts_apply(&c->opts, opts);
-
- ret = bch2_fs_start(c);
- if (ret)
- goto err_stop_fs;
-
- /*
- * We might be doing a RO mount because other options required it, or we
- * have no alloc info and it's a small image with no room to regenerate
- * it
- */
- if (c->opts.read_only)
- fc->sb_flags |= SB_RDONLY;
-
- sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
- ret = PTR_ERR_OR_ZERO(sb);
- if (ret)
- goto err_stop_fs;
-got_sb:
- c = sb->s_fs_info;
-
- if (sb->s_root) {
- if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
- ret = -EBUSY;
- goto err_put_super;
- }
- goto out;
- }
-
- sb->s_blocksize = block_bytes(c);
- sb->s_blocksize_bits = ilog2(block_bytes(c));
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_op = &bch_super_operations;
- sb->s_export_op = &bch_export_ops;
-#ifdef CONFIG_BCACHEFS_QUOTA
- sb->s_qcop = &bch2_quotactl_operations;
- sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
-#endif
- sb->s_xattr = bch2_xattr_handlers;
- sb->s_magic = BCACHEFS_STATFS_MAGIC;
- sb->s_time_gran = c->sb.nsec_per_time_unit;
- sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
- sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
- super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
-
- if (c->sb.multi_device)
- super_set_sysfs_name_uuid(sb);
- else
- strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name));
-
- sb->s_shrink->seeks = 0;
- c->vfs_sb = sb;
- strscpy(sb->s_id, c->name, sizeof(sb->s_id));
-
- ret = super_setup_bdi(sb);
- if (ret)
- goto err_put_super;
-
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
-
- scoped_guard(rcu) {
- for_each_online_member_rcu(c, ca) {
- struct block_device *bdev = ca->disk_sb.bdev;
-
- /* XXX: create an anonymous device for multi device filesystems */
- sb->s_bdev = bdev;
- sb->s_dev = bdev->bd_dev;
- break;
- }
- }
-
- c->dev = sb->s_dev;
-
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- if (c->opts.acl)
- sb->s_flags |= SB_POSIXACL;
-#endif
-
- sb->s_shrink->seeks = 0;
-
-#ifdef CONFIG_UNICODE
- if (bch2_fs_casefold_enabled(c))
- sb->s_encoding = c->cf_encoding;
- generic_set_sb_d_ops(sb);
-#endif
-
- vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
- ret = PTR_ERR_OR_ZERO(vinode);
- bch_err_msg(c, ret, "mounting: error getting root inode");
- if (ret)
- goto err_put_super;
-
- sb->s_root = d_make_root(vinode);
- if (!sb->s_root) {
- bch_err(c, "error mounting: error allocating root dentry");
- ret = -ENOMEM;
- goto err_put_super;
- }
-
- sb->s_flags |= SB_ACTIVE;
-out:
- fc->root = dget(sb->s_root);
-err:
- darray_exit(&devs_to_fs);
- bch2_darray_str_exit(&devs);
- if (ret)
- pr_err("error: %s", bch2_err_str(ret));
- /*
- * On an inconsistency error in recovery we might see an -EROFS derived
-	 * error code (from the journal), but we don't want to return that to
- * userspace as that causes util-linux to retry the mount RO - which is
- * confusing:
- */
- if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
- ret = -EIO;
- return bch2_err_class(ret);
-
-err_stop_fs:
- bch2_fs_stop(c);
- goto err;
-
-err_put_super:
- if (!sb->s_root)
- __bch2_fs_stop(c);
- deactivate_locked_super(sb);
- goto err;
-}
-
-static void bch2_kill_sb(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- generic_shutdown_super(sb);
- bch2_fs_free(c);
-}
-
-static void bch2_fs_context_free(struct fs_context *fc)
-{
- struct bch2_opts_parse *opts = fc->fs_private;
-
- if (opts) {
- printbuf_exit(&opts->parse_later);
- kfree(opts);
- }
-}
-
-static int bch2_fs_parse_param(struct fs_context *fc,
- struct fs_parameter *param)
-{
- /*
- * the "source" param, i.e., the name of the device(s) to mount,
- * is handled by the VFS layer.
- */
- if (!strcmp(param->key, "source"))
- return -ENOPARAM;
-
- struct bch2_opts_parse *opts = fc->fs_private;
- struct bch_fs *c = NULL;
-
- /* for reconfigure, we already have a struct bch_fs */
- if (fc->root)
- c = fc->root->d_sb->s_fs_info;
-
- int ret = bch2_parse_one_mount_opt(c, &opts->opts,
- &opts->parse_later, param->key,
- param->string);
- if (ret)
- pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret));
-
- return bch2_err_class(ret);
-}
-
-static int bch2_fs_reconfigure(struct fs_context *fc)
-{
- struct super_block *sb = fc->root->d_sb;
- struct bch2_opts_parse *opts = fc->fs_private;
- struct bch_fs *c = sb->s_fs_info;
- int ret = 0;
-
- opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
-
- if (opts->opts.read_only != c->opts.read_only) {
- down_write(&c->state_lock);
-
- if (opts->opts.read_only) {
- bch2_fs_read_only(c);
-
- sb->s_flags |= SB_RDONLY;
- } else {
- ret = bch2_fs_read_write(c);
- if (ret) {
- bch_err(c, "error going rw: %i", ret);
- up_write(&c->state_lock);
- ret = -EINVAL;
- goto err;
- }
-
- sb->s_flags &= ~SB_RDONLY;
- }
-
- c->opts.read_only = opts->opts.read_only;
-
- up_write(&c->state_lock);
- }
-
- if (opt_defined(opts->opts, errors))
- c->opts.errors = opts->opts.errors;
-err:
- return bch2_err_class(ret);
-}
-
-static const struct fs_context_operations bch2_context_ops = {
- .free = bch2_fs_context_free,
- .parse_param = bch2_fs_parse_param,
- .get_tree = bch2_fs_get_tree,
- .reconfigure = bch2_fs_reconfigure,
-};
-
-static int bch2_init_fs_context(struct fs_context *fc)
-{
- struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);
-
- if (!opts)
- return -ENOMEM;
-
- opts->parse_later = PRINTBUF;
-
- fc->ops = &bch2_context_ops;
- fc->fs_private = opts;
-
- return 0;
-}
-
-void bch2_fs_vfs_exit(struct bch_fs *c)
-{
- if (c->vfs_inodes_by_inum_table.ht.tbl)
- rhltable_destroy(&c->vfs_inodes_by_inum_table);
- if (c->vfs_inodes_table.tbl)
- rhashtable_destroy(&c->vfs_inodes_table);
-}
-
-int bch2_fs_vfs_init(struct bch_fs *c)
-{
- return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?:
- rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params);
-}
-
-static struct file_system_type bcache_fs_type = {
- .owner = THIS_MODULE,
- .name = "bcachefs",
- .init_fs_context = bch2_init_fs_context,
- .kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS,
-};
-
-MODULE_ALIAS_FS("bcachefs");
-
-void bch2_vfs_exit(void)
-{
- unregister_filesystem(&bcache_fs_type);
- kmem_cache_destroy(bch2_inode_cache);
-}
-
-int __init bch2_vfs_init(void)
-{
- int ret = -ENOMEM;
-
- bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
- SLAB_ACCOUNT);
- if (!bch2_inode_cache)
- goto err;
-
- ret = register_filesystem(&bcache_fs_type);
- if (ret)
- goto err;
-
- return 0;
-err:
- bch2_vfs_exit();
- return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
deleted file mode 100644
index dd2198541455..000000000000
--- a/fs/bcachefs/fs.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_H
-#define _BCACHEFS_FS_H
-
-#include "inode.h"
-#include "opts.h"
-#include "str_hash.h"
-#include "quota_types.h"
-#include "two_state_shared_lock.h"
-
-#include <linux/seqlock.h>
-#include <linux/stat.h>
-
-struct bch_inode_info {
- struct inode v;
- struct rhash_head hash;
- struct rhlist_head by_inum_hash;
- subvol_inum ei_inum;
-
- struct list_head ei_vfs_inode_list;
- unsigned long ei_flags;
-
- struct mutex ei_update_lock;
- u64 ei_quota_reserved;
- unsigned long ei_last_dirtied;
- two_state_lock_t ei_pagecache_lock;
-
- struct mutex ei_quota_lock;
- struct bch_qid ei_qid;
-
- /*
- * When we've been doing nocow writes we'll need to issue flushes to the
- * underlying block devices
- *
- * XXX: a device may have had a flush issued by some other codepath. It
- * would be better to keep for each device a sequence number that's
-	 * incremented when we issue a cache flush, and track here the sequence
- * number that needs flushing.
- */
- struct bch_devs_mask ei_devs_need_flush;
-
- /* copy of inode in btree: */
- struct bch_inode_unpacked ei_inode;
-};
-
-#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
-#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0)
-
-#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
-#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1)
-
-static inline subvol_inum inode_inum(struct bch_inode_info *inode)
-{
- return inode->ei_inum;
-}
-
-/*
- * Set if we've gotten a btree error for this inode, and thus the vfs inode and
- * btree inode may be inconsistent:
- */
-#define EI_INODE_ERROR 0
-
-/*
- * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
- * those:
- */
-#define EI_INODE_SNAPSHOT 1
-#define EI_INODE_HASHED 2
-
-#define to_bch_ei(_inode) \
- container_of_or_null(_inode, struct bch_inode_info, v)
-
-static inline int ptrcmp(void *l, void *r)
-{
- return cmp_int(l, r);
-}
-
-enum bch_inode_lock_op {
- INODE_PAGECACHE_BLOCK = (1U << 0),
- INODE_UPDATE_LOCK = (1U << 1),
-};
-
-#define bch2_lock_inodes(_locks, ...) \
-do { \
- struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
- unsigned i; \
- \
- bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
- \
- for (i = 1; i < ARRAY_SIZE(a); i++) \
- if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_PAGECACHE_BLOCK) \
- bch2_pagecache_block_get(a[i]);\
- if ((_locks) & INODE_UPDATE_LOCK) \
- mutex_lock_nested(&a[i]->ei_update_lock, i);\
- } \
-} while (0)
-
-#define bch2_unlock_inodes(_locks, ...) \
-do { \
- struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
- unsigned i; \
- \
- bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
- \
- for (i = 1; i < ARRAY_SIZE(a); i++) \
- if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_PAGECACHE_BLOCK) \
- bch2_pagecache_block_put(a[i]);\
- if ((_locks) & INODE_UPDATE_LOCK) \
- mutex_unlock(&a[i]->ei_update_lock); \
- } \
-} while (0)
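bch2_lock_inodes()/bch2_unlock_inodes() sort the inode pointers and take the locks in ascending address order, skipping duplicates, so two tasks locking the same pair of inodes can never deadlock ABBA-style. The same pattern in a self-contained sketch (hypothetical lock_two()/unlock_two() helpers, plain pthread mutexes standing in for the bcachefs locks):

#include <pthread.h>
#include <stdlib.h>

static int ptr_cmp(const void *l, const void *r)
{
	const void *a = *(const void * const *)l;
	const void *b = *(const void * const *)r;

	return (a > b) - (a < b);
}

/* Take both mutexes in a globally consistent (address) order. */
void lock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_t *v[2] = { a, b };

	qsort(v, 2, sizeof(v[0]), ptr_cmp);

	pthread_mutex_lock(v[0]);
	if (v[1] != v[0])		/* same object passed twice: lock once */
		pthread_mutex_lock(v[1]);
}

void unlock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (b != a)
		pthread_mutex_unlock(b);
}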
-
-static inline struct bch_inode_info *file_bch_inode(struct file *file)
-{
- return to_bch_ei(file_inode(file));
-}
-
-static inline bool inode_attr_changing(struct bch_inode_info *dir,
- struct bch_inode_info *inode,
- enum inode_opt_id id)
-{
- return !(inode->ei_inode.bi_fields_set & (1 << id)) &&
- bch2_inode_opt_get(&dir->ei_inode, id) !=
- bch2_inode_opt_get(&inode->ei_inode, id);
-}
-
-static inline bool inode_attrs_changing(struct bch_inode_info *dir,
- struct bch_inode_info *inode)
-{
- unsigned id;
-
- for (id = 0; id < Inode_opt_nr; id++)
- if (inode_attr_changing(dir, inode, id))
- return true;
-
- return false;
-}
-
-struct bch_inode_unpacked;
-
-#ifndef NO_BCACHEFS_FS
-
-struct bch_inode_info *
-__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
- struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
-
-int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
-
-int bch2_fs_quota_transfer(struct bch_fs *,
- struct bch_inode_info *,
- struct bch_qid,
- unsigned,
- enum quota_acct_mode);
-
-static inline int bch2_set_projid(struct bch_fs *c,
- struct bch_inode_info *inode,
- u32 projid)
-{
- struct bch_qid qid = inode->ei_qid;
-
- qid.q[QTYP_PRJ] = projid;
-
- return bch2_fs_quota_transfer(c, inode, qid,
- 1 << QTYP_PRJ,
- KEY_TYPE_QUOTA_PREALLOC);
-}
-
-struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
-
-/* returns 0 if we want to do the update; any error is passed up */
-typedef int (*inode_set_fn)(struct btree_trans *,
- struct bch_inode_info *,
- struct bch_inode_unpacked *, void *);
-
-void bch2_inode_update_after_write(struct btree_trans *,
- struct bch_inode_info *,
- struct bch_inode_unpacked *,
- unsigned);
-int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
- inode_set_fn, void *, unsigned);
-
-int bch2_setattr_nonsize(struct mnt_idmap *,
- struct bch_inode_info *,
- struct iattr *);
-int __bch2_unlink(struct inode *, struct dentry *, bool);
-
-void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
-
-void bch2_fs_vfs_exit(struct bch_fs *);
-int bch2_fs_vfs_init(struct bch_fs *);
-
-void bch2_vfs_exit(void);
-int bch2_vfs_init(void);
-
-#else
-
-#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
-
-static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
-
-static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
- snapshot_id_list *s) {}
-
-static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
-static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
-
-static inline void bch2_vfs_exit(void) {}
-static inline int bch2_vfs_init(void) { return 0; }
-
-#endif /* NO_BCACHEFS_FS */
-
-#endif /* _BCACHEFS_FS_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
deleted file mode 100644
index 15c1e890d299..000000000000
--- a/fs/bcachefs/fsck.c
+++ /dev/null
@@ -1,3363 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bcachefs_ioctl.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "darray.h"
-#include "dirent.h"
-#include "error.h"
-#include "fs.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "keylist.h"
-#include "namei.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "super.h"
-#include "thread_with_file.h"
-#include "xattr.h"
-
-#include <linux/bsearch.h>
-#include <linux/dcache.h> /* struct qstr */
-
-static int dirent_points_to_inode_nowarn(struct bch_fs *c,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
-{
- if (d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
- : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
- return 0;
- return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
-}
-
-static void dirent_inode_mismatch_msg(struct printbuf *out,
- struct bch_fs *c,
- struct bkey_s_c_dirent dirent,
- struct bch_inode_unpacked *inode)
-{
- prt_str(out, "inode points to dirent that does not point back:");
- prt_newline(out);
- bch2_bkey_val_to_text(out, c, dirent.s_c);
- prt_newline(out);
- bch2_inode_unpacked_to_text(out, inode);
-}
-
-static int dirent_points_to_inode(struct bch_fs *c,
- struct bkey_s_c_dirent dirent,
- struct bch_inode_unpacked *inode)
-{
- int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
- if (ret) {
- struct printbuf buf = PRINTBUF;
- dirent_inode_mismatch_msg(&buf, c, dirent, inode);
- bch_warn(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- return ret;
-}
-
-/*
- * XXX: this is handling transaction restarts without returning
- * -BCH_ERR_transaction_restart_nested, which is not how we do things anymore:
- */
-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
- u32 snapshot)
-{
- u64 sectors = 0;
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(inum, 0, snapshot),
- POS(inum, U64_MAX),
- 0, k, ({
- if (bkey_extent_is_allocation(k.k))
- sectors += k.k->size;
- 0;
- }));
-
- return ret ?: sectors;
-}
-
-static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
- u32 snapshot)
-{
- u64 subdirs = 0;
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- POS(inum, U64_MAX),
- 0, k, ({
- if (k.k->type == KEY_TYPE_dirent &&
- bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
- subdirs++;
- 0;
- }));
-
- return ret ?: subdirs;
-}
-
-static int subvol_lookup(struct btree_trans *trans, u32 subvol,
- u32 *snapshot, u64 *inum)
-{
- struct bch_subvolume s;
- int ret = bch2_subvolume_get(trans, subvol, false, &s);
-
- *snapshot = le32_to_cpu(s.snapshot);
- *inum = le64_to_cpu(s.inode);
- return ret;
-}
-
-static int lookup_dirent_in_snapshot(struct btree_trans *trans,
- struct bch_hash_info hash_info,
- subvol_inum dir, struct qstr *name,
- u64 *target, unsigned *type, u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
- &hash_info, dir, name, 0, snapshot);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- *target = le64_to_cpu(d.v->d_inum);
- *type = d.v->d_type;
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-/*
- * Find any subvolume associated with a tree of snapshots.
- * We can't rely on master_subvol - it might have been deleted.
- */
-static int find_snapshot_tree_subvol(struct btree_trans *trans,
- u32 tree_id, u32 *subvol)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
-
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
- if (le32_to_cpu(s.v->tree) != tree_id)
- continue;
-
- if (s.v->subvol) {
- *subvol = le32_to_cpu(s.v->subvol);
- goto found;
- }
- }
- ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol);
-found:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
- struct bch_inode_unpacked *lostfound,
- u64 reattaching_inum)
-{
- struct bch_fs *c = trans->c;
- struct qstr lostfound_str = QSTR("lost+found");
- struct btree_iter lostfound_iter = {};
- u64 inum = 0;
- unsigned d_type = 0;
- int ret;
-
- struct bch_snapshot_tree st;
- ret = bch2_snapshot_tree_lookup(trans,
- bch2_snapshot_tree(c, snapshot), &st);
- if (ret)
- return ret;
-
- u32 subvolid;
- ret = find_snapshot_tree_subvol(trans,
- bch2_snapshot_tree(c, snapshot), &subvolid);
- bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
- bch2_snapshot_tree(c, snapshot));
- if (ret)
- return ret;
-
- struct bch_subvolume subvol;
- ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
- bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
- if (ret)
- return ret;
-
- if (!subvol.inode) {
- struct btree_iter iter;
- struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- 0, subvolume);
- ret = PTR_ERR_OR_ZERO(subvol);
- if (ret)
- return ret;
-
- subvol->v.inode = cpu_to_le64(reattaching_inum);
- bch2_trans_iter_exit(trans, &iter);
- }
-
- subvol_inum root_inum = {
- .subvol = subvolid,
- .inum = le64_to_cpu(subvol.inode)
- };
-
- struct bch_inode_unpacked root_inode;
- struct bch_hash_info root_hash_info;
- ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0);
- bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
- root_inum.inum, subvolid);
- if (ret)
- return ret;
-
- root_hash_info = bch2_hash_info_init(c, &root_inode);
-
- ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
- &lostfound_str, &inum, &d_type, snapshot);
- if (bch2_err_matches(ret, ENOENT))
- goto create_lostfound;
-
- bch_err_fn(c, ret);
- if (ret)
- return ret;
-
- if (d_type != DT_DIR) {
- bch_err(c, "error looking up lost+found: not a directory");
- return bch_err_throw(c, ENOENT_not_directory);
- }
-
- /*
- * The bch2_check_dirents pass has already run, so dangling dirents
- * shouldn't exist here:
- */
- ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0);
- bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
- inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
- return ret;
-
-create_lostfound:
- /*
- * we always create lost+found in the root snapshot; we don't want
- * different branches of the snapshot tree to have different lost+found
- */
- snapshot = le32_to_cpu(st.root_snapshot);
- /*
- * XXX: we could have a nicer log message here if we had a nice way to
- * walk backpointers to print a path
- */
- struct printbuf path = PRINTBUF;
- ret = bch2_inum_to_path(trans, root_inum, &path);
- if (ret)
- goto err;
-
- bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
- path.buf, root_inum.subvol, snapshot);
- printbuf_exit(&path);
-
- u64 now = bch2_current_time(c);
- u64 cpu = raw_smp_processor_id();
-
- bch2_inode_init_early(c, lostfound);
- bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
- lostfound->bi_dir = root_inode.bi_inum;
- lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
-
- root_inode.bi_nlink++;
-
- ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
- ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
- if (ret)
- goto err;
-
- ret = bch2_dirent_create_snapshot(trans,
- 0, root_inode.bi_inum, snapshot, &root_hash_info,
- mode_to_type(lostfound->bi_mode),
- &lostfound_str,
- lostfound->bi_inum,
- &lostfound->bi_dir_offset,
- BTREE_UPDATE_internal_snapshot_node|
- STR_HASH_must_create) ?:
- bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
- BTREE_UPDATE_internal_snapshot_node);
-err:
- bch_err_msg(c, ret, "creating lost+found");
- bch2_trans_iter_exit(trans, &lostfound_iter);
- return ret;
-}
-
-static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
-{
- if (inode->bi_inum == BCACHEFS_ROOT_INO &&
- inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
- return false;
-
- /*
- * Subvolume roots are special: older versions of subvolume roots may be
- * disconnected; only the newest version matters.
- *
- * We only keep a single dirent pointing to a subvolume root, i.e.
- * older versions of snapshots will not have a different dirent pointing
- * to the same subvolume root.
- *
- * This is because dirents that point to subvolumes are only visible in
- * the parent subvolume - versioning is not needed - and keeping them
- * around would break fsck, because when we're crossing subvolumes we
- * don't have a consistent snapshot ID to check the inode <-> dirent
- * relationships.
- *
- * Thus, a subvolume root that's been renamed after a snapshot will have
- * a disconnected older version - that's expected.
- *
- * Note that taking a snapshot always updates the root inode (to update
- * the dirent backpointer), so a subvolume root inode with
- * BCH_INODE_has_child_snapshot is never visible.
- */
- if (inode->bi_subvol &&
- (inode->bi_flags & BCH_INODE_has_child_snapshot))
- return false;
-
- return !bch2_inode_has_backpointer(inode) &&
- !(inode->bi_flags & BCH_INODE_unlinked);
-}
-
-static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
- SPOS(d_pos.inode, d_pos.offset, snapshot),
- BTREE_ITER_intent|
- BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (bpos_eq(k.k->p, d_pos)) {
- /*
- * delete_at() doesn't work because the update path doesn't
- * internally use BTREE_ITER_with_updates yet
- */
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
- ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- goto err;
-
- bkey_init(&k->k);
- k->k.type = KEY_TYPE_whiteout;
- k->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
- struct bch_fs *c = trans->c;
- struct bch_inode_unpacked lostfound;
- char name_buf[20];
- int ret;
-
- u32 dirent_snapshot = inode->bi_snapshot;
- if (inode->bi_subvol) {
- inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
-
- struct btree_iter subvol_iter;
- struct bkey_i_subvolume *subvol =
- bch2_bkey_get_mut_typed(trans, &subvol_iter,
- BTREE_ID_subvolumes, POS(0, inode->bi_subvol),
- 0, subvolume);
- ret = PTR_ERR_OR_ZERO(subvol);
- if (ret)
- return ret;
-
- subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL;
- bch2_trans_iter_exit(trans, &subvol_iter);
-
- u64 root_inum;
- ret = subvol_lookup(trans, inode->bi_parent_subvol,
- &dirent_snapshot, &root_inum);
- if (ret)
- return ret;
-
- snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
- } else {
- snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
- }
-
- ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
- if (ret)
- return ret;
-
- bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum);
-
- lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
-
- /* ensure lost+found inode is also present in inode snapshot */
- if (!inode->bi_subvol) {
- BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
- lostfound.bi_snapshot = inode->bi_snapshot;
- }
-
- ret = __bch2_fsck_write_inode(trans, &lostfound);
- if (ret)
- return ret;
-
- struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
- struct qstr name = QSTR(name_buf);
-
- inode->bi_dir = lostfound.bi_inum;
-
- ret = bch2_dirent_create_snapshot(trans,
- inode->bi_parent_subvol, lostfound.bi_inum,
- dirent_snapshot,
- &dir_hash,
- inode_d_type(inode),
- &name,
- inode->bi_subvol ?: inode->bi_inum,
- &inode->bi_dir_offset,
- BTREE_UPDATE_internal_snapshot_node|
- STR_HASH_must_create);
- if (ret) {
- bch_err_msg(c, ret, "error creating dirent");
- return ret;
- }
-
- ret = __bch2_fsck_write_inode(trans, inode);
- if (ret)
- return ret;
-
- {
- CLASS(printbuf, buf)();
- ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum,
- inode->bi_snapshot, NULL, &buf);
- if (ret)
- return ret;
-
- bch_info(c, "reattached at %s", buf.buf);
- }
-
- /*
- * Fix up inodes in child snapshots: if they should also be reattached
- * update the backpointer field, if they should not be we need to emit
- * whiteouts for the dirent we just created.
- */
- if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
- snapshot_id_list whiteouts_done;
- struct btree_iter iter;
- struct bkey_s_c k;
-
- darray_init(&whiteouts_done);
-
- for_each_btree_key_reverse_norestart(trans, iter,
- BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
- BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
- if (k.k->p.offset != inode->bi_inum)
- break;
-
- if (!bkey_is_inode(k.k) ||
- !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
- snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
- continue;
-
- struct bch_inode_unpacked child_inode;
- ret = bch2_inode_unpack(k, &child_inode);
- if (ret)
- break;
-
- if (!inode_should_reattach(&child_inode)) {
- ret = maybe_delete_dirent(trans,
- SPOS(lostfound.bi_inum, inode->bi_dir_offset,
- dirent_snapshot),
- k.k->p.snapshot);
- if (ret)
- break;
-
- ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
- if (ret)
- break;
- } else {
- iter.snapshot = k.k->p.snapshot;
- child_inode.bi_dir = inode->bi_dir;
- child_inode.bi_dir_offset = inode->bi_dir_offset;
-
- ret = bch2_inode_write_flags(trans, &iter, &child_inode,
- BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- break;
- }
- }
- darray_exit(&whiteouts_done);
- bch2_trans_iter_exit(trans, &iter);
- }
-
- return ret;
-}
-
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos)
-{
- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
-}
-
-static int remove_backpointer(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
-{
- if (!bch2_inode_has_backpointer(inode))
- return 0;
-
- u32 snapshot = inode->bi_snapshot;
-
- if (inode->bi_parent_subvol) {
- int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot);
- if (ret)
- return ret;
- }
-
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
- int ret = bkey_err(d) ?:
- dirent_points_to_inode(c, d, inode) ?:
- bch2_fsck_remove_dirent(trans, d.k->p);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
-{
- struct bch_fs *c = trans->c;
-
- struct bch_inode_unpacked inode;
- int ret = bch2_inode_find_by_inum_trans(trans,
- (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
- &inode);
- if (ret)
- return ret;
-
- ret = remove_backpointer(trans, &inode);
- if (!bch2_err_matches(ret, ENOENT))
- bch_err_msg(c, ret, "removing dirent");
- if (ret)
- return ret;
-
- ret = reattach_inode(trans, &inode);
- bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
- return ret;
-}
-
-static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
-{
- struct bch_fs *c = trans->c;
-
- if (!bch2_snapshot_is_leaf(c, snapshotid)) {
- bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
- return bch_err_throw(c, fsck_repair_unimplemented);
- }
-
- /*
- * If inum isn't set, that means we're being called from check_dirents,
- * not check_inodes - the root of this subvolume doesn't exist or we
- * would have found it there:
- */
- if (!inum) {
- struct btree_iter inode_iter = {};
- struct bch_inode_unpacked new_inode;
- u64 cpu = raw_smp_processor_id();
-
- bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
-
- new_inode.bi_subvol = subvolid;
-
- int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
- bch2_btree_iter_traverse(trans, &inode_iter) ?:
- bch2_inode_write(trans, &inode_iter, &new_inode);
- bch2_trans_iter_exit(trans, &inode_iter);
- if (ret)
- return ret;
-
- inum = new_inode.bi_inum;
- }
-
- bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
-
- struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
- int ret = PTR_ERR_OR_ZERO(new_subvol);
- if (ret)
- return ret;
-
- bkey_subvolume_init(&new_subvol->k_i);
- new_subvol->k.p.offset = subvolid;
- new_subvol->v.snapshot = cpu_to_le32(snapshotid);
- new_subvol->v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
- if (ret)
- return ret;
-
- struct btree_iter iter;
- struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, snapshotid),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
- bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
- if (ret)
- return ret;
-
- u32 snapshot_tree = le32_to_cpu(s->v.tree);
-
- s->v.subvol = cpu_to_le32(subvolid);
- SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
- bch2_trans_iter_exit(trans, &iter);
-
- struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
- 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(st);
- bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
- if (ret)
- return ret;
-
- if (!st->v.master_subvol)
- st->v.master_subvol = cpu_to_le32(subvolid);
-
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
-{
- struct bch_fs *c = trans->c;
- unsigned i_mode = S_IFREG;
- u64 i_size = 0;
-
- switch (btree) {
- case BTREE_ID_extents: {
- struct btree_iter iter = {};
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
- struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
- bch2_trans_iter_exit(trans, &iter);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- i_size = k.k->p.offset << 9;
- break;
- }
- case BTREE_ID_dirents:
- i_mode = S_IFDIR;
- break;
- case BTREE_ID_xattrs:
- break;
- default:
- BUG();
- }
-
- struct bch_inode_unpacked new_inode;
- bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
- new_inode.bi_size = i_size;
- new_inode.bi_inum = inum;
- new_inode.bi_snapshot = snapshot;
-
- return __bch2_fsck_write_inode(trans, &new_inode);
-}
-
-static inline void snapshots_seen_exit(struct snapshots_seen *s)
-{
- darray_exit(&s->ids);
-}
-
-static inline void snapshots_seen_init(struct snapshots_seen *s)
-{
- memset(s, 0, sizeof(*s));
-}
-
-static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
-{
- u32 *i;
- __darray_for_each(s->ids, i) {
- if (*i == id)
- return 0;
- if (*i > id)
- break;
- }
-
- int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
- if (ret)
- bch_err(c, "error reallocating snapshots_seen table (size %zu)",
- s->ids.size);
- return ret;
-}
-
-static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
- enum btree_id btree_id, struct bpos pos)
-{
- if (!bkey_eq(s->pos, pos))
- s->ids.nr = 0;
- s->pos = pos;
-
- return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
-}
-
-/**
- * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
- * and @ancestor hasn't been overwritten in @seen
- *
- * @c: filesystem handle
- * @seen: list of snapshot ids already seen at current position
- * @id: descendant snapshot id
- * @ancestor: ancestor snapshot id
- *
- * Returns: whether key in @ancestor snapshot is visible in @id snapshot
- */
-static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
- u32 id, u32 ancestor)
-{
- EBUG_ON(id > ancestor);
-
- if (id == ancestor)
- return true;
-
- if (!bch2_snapshot_is_ancestor(c, id, ancestor))
- return false;
-
- /*
- * We know that @id is a descendant of @ancestor, we're checking if
- * we've seen a key that overwrote @ancestor - i.e. also a descendant of
- * @ancestor and with @id as a descendant.
- *
- * But we already know that we're scanning IDs between @id and @ancestor
- * numerically, since snapshot ID lists are kept sorted, so if we find
- * an id that's an ancestor of @id we're done:
- */
- darray_for_each_reverse(seen->ids, i)
- if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i))
- return false;
-
- return true;
-}
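A toy model of the check above, with snapshots as a parent array and a hypothetical is_ancestor() helper: a key written in @ancestor is visible in @id unless some already-seen snapshot other than @ancestor is itself an ancestor of @id, i.e. the key was overwritten on @id's branch. This is only an illustration of the idea, not the bcachefs snapshot code:

#include <stdbool.h>
#include <stddef.h>

/* parent[id] is the parent snapshot of @id; descendants have smaller ids */
static bool is_ancestor(const unsigned *parent, unsigned id, unsigned ancestor)
{
	while (id && id < ancestor)
		id = parent[id];
	return id == ancestor;
}

static bool visible(const unsigned *parent,
		    const unsigned *seen, size_t nr_seen,
		    unsigned id, unsigned ancestor)
{
	if (id == ancestor)
		return true;
	if (!is_ancestor(parent, id, ancestor))
		return false;

	/* a seen snapshot on @id's branch overwrote the key from @ancestor */
	for (size_t i = 0; i < nr_seen; i++)
		if (seen[i] != ancestor && is_ancestor(parent, id, seen[i]))
			return false;

	return true;
}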
-
-/**
- * ref_visible - given a key with snapshot id @src that points to a key with
- * snapshot id @dst, test whether there is some snapshot in which @dst is
- * visible.
- *
- * @c: filesystem handle
- * @s: list of snapshot IDs already seen at @src
- * @src: snapshot ID of src key
- * @dst: snapshot ID of dst key
- * Returns: true if there is some snapshot in which @dst is visible
- *
- * Assumes we're visiting @src keys in natural key order
- */
-static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
- u32 src, u32 dst)
-{
- return dst <= src
- ? key_visible_in_snapshot(c, s, dst, src)
- : bch2_snapshot_is_ancestor(c, src, dst);
-}
-
-static int ref_visible2(struct bch_fs *c,
- u32 src, struct snapshots_seen *src_seen,
- u32 dst, struct snapshots_seen *dst_seen)
-{
- if (dst > src) {
- swap(dst, src);
- swap(dst_seen, src_seen);
- }
- return key_visible_in_snapshot(c, src_seen, dst, src);
-}
-
-#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
- for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
- (_i)->inode.bi_snapshot <= (_snapshot); _i++) \
- if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))
-
-struct inode_walker_entry {
- struct bch_inode_unpacked inode;
- bool whiteout;
- u64 count;
- u64 i_size;
-};
-
-struct inode_walker {
- bool first_this_inode;
- bool have_inodes;
- bool recalculate_sums;
- struct bpos last_pos;
-
- DARRAY(struct inode_walker_entry) inodes;
- snapshot_id_list deletes;
-};
-
-static void inode_walker_exit(struct inode_walker *w)
-{
- darray_exit(&w->inodes);
- darray_exit(&w->deletes);
-}
-
-static struct inode_walker inode_walker_init(void)
-{
- return (struct inode_walker) { 0, };
-}
-
-static int add_inode(struct bch_fs *c, struct inode_walker *w,
- struct bkey_s_c inode)
-{
- int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
- .whiteout = !bkey_is_inode(inode.k),
- }));
- if (ret)
- return ret;
-
- struct inode_walker_entry *n = &darray_last(w->inodes);
- if (!n->whiteout) {
- return bch2_inode_unpack(inode, &n->inode);
- } else {
- n->inode.bi_inum = inode.k->p.offset;
- n->inode.bi_snapshot = inode.k->p.snapshot;
- return 0;
- }
-}
-
-static int get_inodes_all_snapshots(struct btree_trans *trans,
- struct inode_walker *w, u64 inum)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- /*
- * We no longer have inodes for w->last_pos; clear this to avoid
- * screwing up check_i_sectors/check_subdir_count if we take a
- * transaction restart here:
- */
- w->have_inodes = false;
- w->recalculate_sums = false;
- w->inodes.nr = 0;
-
- for_each_btree_key_max_norestart(trans, iter,
- BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
- BTREE_ITER_all_snapshots, k, ret) {
- ret = add_inode(c, w, k);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- return ret;
-
- w->first_this_inode = true;
- w->have_inodes = true;
- return 0;
-}
-
-static int get_visible_inodes(struct btree_trans *trans,
- struct inode_walker *w,
- struct snapshots_seen *s,
- u64 inum)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- w->inodes.nr = 0;
- w->deletes.nr = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
-
- if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
- continue;
-
- if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
- continue;
-
- ret = bkey_is_inode(k.k)
- ? add_inode(c, w, k)
- : snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
-
- struct inode_walker_entry *i = darray_find_p(w->inodes, i,
- bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));
-
- if (!i)
- return NULL;
-
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
- trans, snapshot_key_missing_inode_snapshot,
- "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
- "unexpected because we should always update the inode when we update a key in that inode\n"
- "%s",
- w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
- (bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- if (!i->whiteout) {
- struct bch_inode_unpacked new = i->inode;
- new.bi_snapshot = k.k->p.snapshot;
- ret = __bch2_fsck_write_inode(trans, &new);
- } else {
- struct bkey_i whiteout;
- bkey_init(&whiteout.k);
- whiteout.k.type = KEY_TYPE_whiteout;
- whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot);
- ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
- &whiteout,
- BTREE_UPDATE_internal_snapshot_node);
- }
-
- if (ret)
- goto fsck_err;
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0);
- if (ret)
- goto fsck_err;
-
- struct inode_walker_entry new_entry = *i;
-
- new_entry.inode.bi_snapshot = k.k->p.snapshot;
- new_entry.count = 0;
- new_entry.i_size = 0;
-
- while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
- --i;
-
- size_t pos = i - w->inodes.data;
- ret = darray_insert_item(&w->inodes, pos, new_entry);
- if (ret)
- goto fsck_err;
-
- ret = bch_err_throw(c, transaction_restart_nested);
- goto fsck_err;
- }
-
- printbuf_exit(&buf);
- return i;
-fsck_err:
- printbuf_exit(&buf);
- return ERR_PTR(ret);
-}
-
-static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
- struct inode_walker *w,
- struct bkey_s_c k)
-{
- if (w->last_pos.inode != k.k->p.inode) {
- int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
- if (ret)
- return ERR_PTR(ret);
- }
-
- w->last_pos = k.k->p;
-
- return lookup_inode_for_snapshot(trans, w, k);
-}
-
-/*
- * Prefer to delete the first one, since that will be the one at the wrong
- * offset:
- * return value: 0 -> delete k1, 1 -> delete k2
- */
-int bch2_fsck_update_backpointers(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_i *new)
-{
- if (new->k.type != KEY_TYPE_dirent)
- return 0;
-
- struct bkey_i_dirent *d = bkey_i_to_dirent(new);
- struct inode_walker target = inode_walker_init();
- int ret = 0;
-
- if (d->v.d_type == DT_SUBVOL) {
- bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
- ret = -BCH_ERR_fsck_repair_unimplemented;
- } else {
- ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
- if (ret)
- goto err;
-
- darray_for_each(target.inodes, i) {
- i->inode.bi_dir_offset = d->k.p.offset;
- ret = __bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- goto err;
- }
- }
-err:
- inode_walker_exit(&target);
- return ret;
-}
-
-static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
-{
- if (inode->bi_subvol) {
- u64 inum;
- int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
- if (ret)
- return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
- }
-
- return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
-}
-
-static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
- int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int check_inode_dirent_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- bool *write_inode)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- u32 inode_snapshot = inode->bi_snapshot;
- struct btree_iter dirent_iter = {};
- struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
- int ret = bkey_err(d);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
- inode->bi_subvol &&
- (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
- /* Older version of a renamed subvolume root: we won't have a
- * correct dirent for it. That's expected, see
- * inode_should_reattach().
- *
- * We don't clear the backpointer field when doing the rename
- * because there might be arbitrarily many versions in older
- * snapshots.
- */
- inode->bi_dir = 0;
- inode->bi_dir_offset = 0;
- *write_inode = true;
- goto out;
- }
-
- if (fsck_err_on(ret,
- trans, inode_points_to_missing_dirent,
- "inode points to missing dirent\n%s",
- (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
- fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
- trans, inode_points_to_wrong_dirent,
- "%s",
- (printbuf_reset(&buf),
- dirent_inode_mismatch_msg(&buf, c, d, inode),
- buf.buf))) {
- /*
- * We just clear the backpointer fields for now. If we find a
- * dirent that points to this inode in check_dirents(), we'll
- * update it then; then when we get to check_path() if the
- * backpointer is still 0 we'll reattach it.
- */
- inode->bi_dir = 0;
- inode->bi_dir_offset = 0;
- *write_inode = true;
- }
-out:
- ret = 0;
-fsck_err:
- bch2_trans_iter_exit(trans, &dirent_iter);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_inode_unpacked *snapshot_root,
- struct snapshots_seen *s)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct bch_inode_unpacked u;
- bool do_update = false;
- int ret;
-
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret < 0)
- goto err;
- if (ret)
- return 0;
-
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
- if (ret)
- goto err;
-
- if (!bkey_is_inode(k.k))
- return 0;
-
- ret = bch2_inode_unpack(k, &u);
- if (ret)
- goto err;
-
- if (snapshot_root->bi_inum != u.bi_inum) {
- ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root);
- if (ret)
- goto err;
- }
-
- if (u.bi_hash_seed != snapshot_root->bi_hash_seed ||
- INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) {
- ret = bch2_repair_inode_hash_info(trans, snapshot_root);
- BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented);
- if (ret)
- goto err;
- }
-
- ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update);
- if (ret)
- goto err;
-
- if (bch2_inode_has_backpointer(&u)) {
- ret = check_inode_dirent_inode(trans, &u, &do_update);
- if (ret)
- goto err;
- }
-
- if (fsck_err_on(bch2_inode_has_backpointer(&u) &&
- (u.bi_flags & BCH_INODE_unlinked),
- trans, inode_unlinked_but_has_dirent,
- "inode unlinked but has dirent\n%s",
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- u.bi_flags &= ~BCH_INODE_unlinked;
- do_update = true;
- }
-
- if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
- /* Check for this early so that check_unreachable_inode() will reattach it */
-
- ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
- if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
- goto err;
-
- fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
- "dir unlinked but not empty\n%s",
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf));
- u.bi_flags &= ~BCH_INODE_unlinked;
- do_update = true;
- ret = 0;
- }
-
- if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size,
- trans, inode_dir_has_nonzero_i_size,
- "directory %llu:%u with nonzero i_size %lli",
- u.bi_inum, u.bi_snapshot, u.bi_size)) {
- u.bi_size = 0;
- do_update = true;
- }
-
- ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
- trans, inode_has_child_snapshots_wrong,
- "inode has_child_snapshots flag wrong (should be %u)\n%s",
- ret,
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- if (ret)
- u.bi_flags |= BCH_INODE_has_child_snapshot;
- else
- u.bi_flags &= ~BCH_INODE_has_child_snapshot;
- do_update = true;
- }
- ret = 0;
-
- if ((u.bi_flags & BCH_INODE_unlinked) &&
- !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
- if (!test_bit(BCH_FS_started, &c->flags)) {
- /*
- * If we're not in online fsck, don't delete unlinked
- * inodes, just make sure they're on the deleted list.
- *
- * They might be referred to by a logged operation -
- * i.e. we might have crashed in the middle of a
- * truncate on an unlinked but open file - so we want to
- * let the delete_dead_inodes kill it after resuming
- * logged ops.
- */
- ret = check_inode_deleted_list(trans, k.k->p);
- if (ret < 0)
- goto err_noprint;
-
- fsck_err_on(!ret,
- trans, unlinked_inode_not_on_deleted_list,
- "inode %llu:%u unlinked, but not on deleted list",
- u.bi_inum, k.k->p.snapshot);
-
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
- if (ret)
- goto err;
- } else {
- ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(!ret,
- trans, inode_unlinked_and_not_open,
- "inode %llu:%u unlinked and not open",
- u.bi_inum, u.bi_snapshot)) {
- ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
- bch_err_msg(c, ret, "in fsck deleting inode");
- goto err_noprint;
- }
- ret = 0;
- }
- }
-
- if (fsck_err_on(u.bi_parent_subvol &&
- (u.bi_subvol == 0 ||
- u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
- trans, inode_bi_parent_nonzero,
- "inode %llu:%u has subvol %u but nonzero parent subvol %u",
- u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
- u.bi_parent_subvol = 0;
- do_update = true;
- }
-
- if (u.bi_subvol) {
- struct bch_subvolume s;
-
- ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
- ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
- goto do_update;
- }
-
- if (fsck_err_on(ret,
- trans, inode_bi_subvol_missing,
- "inode %llu:%u bi_subvol points to missing subvolume %u",
- u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
- fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
- !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
- k.k->p.snapshot),
- trans, inode_bi_subvol_wrong,
- "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
- u.bi_inum, k.k->p.snapshot, u.bi_subvol,
- le64_to_cpu(s.inode),
- le32_to_cpu(s.snapshot))) {
- u.bi_subvol = 0;
- u.bi_parent_subvol = 0;
- do_update = true;
- }
- }
-
- if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
- trans, inode_journal_seq_in_future,
- "inode journal seq in future (currently at %llu)\n%s",
- journal_cur_seq(&c->journal),
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &u),
- buf.buf))) {
- u.bi_journal_seq = journal_cur_seq(&c->journal);
- do_update = true;
- }
-do_update:
- if (do_update) {
- ret = __bch2_fsck_write_inode(trans, &u);
- bch_err_msg(c, ret, "in fsck updating inode");
- if (ret)
- goto err_noprint;
- }
-err:
-fsck_err:
- bch_err_fn(c, ret);
-err_noprint:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_inodes(struct bch_fs *c)
-{
- struct bch_inode_unpacked snapshot_root = {};
- struct snapshots_seen s;
-
- snapshots_seen_init(&s);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_inode(trans, &iter, k, &snapshot_root, &s)));
-
- snapshots_seen_exit(&s);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- /*
- * We look for inodes to reattach in natural key order, leaves first,
- * but we should do the reattach at the oldest version that needs to be
- * reattached:
- */
- for_each_btree_key_norestart(trans, iter,
- BTREE_ID_inodes,
- SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inode->bi_inum)
- break;
-
- if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
- continue;
-
- if (!bkey_is_inode(k.k))
- break;
-
- struct bch_inode_unpacked parent_inode;
- ret = bch2_inode_unpack(k, &parent_inode);
- if (ret)
- break;
-
- if (!inode_should_reattach(&parent_inode))
- break;
-
- *inode = parent_inode;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static int check_unreachable_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (!bkey_is_inode(k.k))
- return 0;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- return ret;
-
- if (!inode_should_reattach(&inode))
- return 0;
-
- ret = find_oldest_inode_needs_reattach(trans, &inode);
- if (ret)
- return ret;
-
- if (fsck_err(trans, inode_unreachable,
- "unreachable inode:\n%s",
- (bch2_inode_unpacked_to_text(&buf, &inode),
- buf.buf)))
- ret = reattach_inode(trans, &inode);
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * Reattach unreachable (but not unlinked) inodes
- *
- * Run after check_inodes() and check_dirents(), so we know that inode
- * backpointer fields point to valid dirents, and every inode that has a dirent
- * that points to it has its backpointer field set - so we're just looking for
- * non-unlinked inodes without backpointers:
- *
- * XXX: this is racy w.r.t. hardlink removal in online fsck
- */
-int bch2_check_unreachable_inodes(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_unreachable_inode(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
-{
- switch (btree) {
- case BTREE_ID_extents:
- return S_ISREG(mode) || S_ISLNK(mode);
- case BTREE_ID_dirents:
- return S_ISDIR(mode);
- case BTREE_ID_xattrs:
- return true;
- default:
- BUG();
- }
-}
-
-static int check_key_has_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct inode_walker *inode,
- struct inode_walker_entry *i,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter iter2 = {};
- int ret = PTR_ERR_OR_ZERO(i);
- if (ret)
- return ret;
-
- if (k.k->type == KEY_TYPE_whiteout)
- goto out;
-
- bool have_inode = i && !i->whiteout;
-
- if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes)))
- goto reconstruct;
-
- if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode))
- goto out;
-
- prt_printf(&buf, ", ");
-
- bool have_old_inode = false;
- darray_for_each(inode->inodes, i2)
- if (!i2->whiteout &&
- bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) &&
- btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) {
- prt_printf(&buf, "but found good inode in older snapshot\n");
- bch2_inode_unpacked_to_text(&buf, &i2->inode);
- prt_newline(&buf);
- have_old_inode = true;
- break;
- }
-
- struct bkey_s_c k2;
- unsigned nr_keys = 0;
-
- prt_printf(&buf, "found keys:\n");
-
- for_each_btree_key_max_norestart(trans, iter2, iter->btree_id,
- SPOS(k.k->p.inode, 0, k.k->p.snapshot),
- POS(k.k->p.inode, U64_MAX),
- 0, k2, ret) {
- nr_keys++;
- if (nr_keys <= 10) {
- bch2_bkey_val_to_text(&buf, c, k2);
- prt_newline(&buf);
- }
- if (nr_keys >= 100)
- break;
- }
-
- if (ret)
- goto err;
-
- if (nr_keys > 100)
- prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys);
- else if (nr_keys > 10)
- prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys);
-
- if (!have_inode) {
- if (fsck_err_on(!have_inode,
- trans, key_in_missing_inode,
- "key in missing inode%s", buf.buf)) {
- /*
- * Maybe a deletion that raced with data move, or something
- * weird like that? But if we know the inode was deleted, or
- * it's just a few keys, we can safely delete them.
- *
- * If it's many keys, we should probably recreate the inode
- */
- if (have_old_inode || nr_keys <= 2)
- goto delete;
- else
- goto reconstruct;
- }
- } else {
- /*
- * not autofix, this one would be a giant wtf - bit error in the
- * inode corrupting i_mode?
- *
- * may want to try repairing inode instead of deleting
- */
- if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
- trans, key_in_wrong_inode_type,
- "key for wrong inode mode %o%s",
- i->inode.bi_mode, buf.buf))
- goto delete;
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &iter2);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-delete:
- /*
- * XXX: print out more info
- * count up extents for this inode, check if we have a different inode in
- * an older snapshot version, perhaps decide if we want to reconstitute
- */
- ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
- goto out;
-reconstruct:
- ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- inode->last_pos.inode--;
- ret = bch_err_throw(c, transaction_restart_nested);
- goto out;
-}
-
-static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
- s64 count2;
-
- darray_for_each(w->inodes, i) {
- if (i->inode.bi_sectors == i->count)
- continue;
-
- count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot);
-
- if (w->recalculate_sums)
- i->count = count2;
-
- if (i->count != count2) {
- bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
- i->count = count2;
- }
-
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
- trans, inode_i_sectors_wrong,
- "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
- w->last_pos.inode, i->inode.bi_snapshot,
- i->inode.bi_sectors, i->count)) {
- i->inode.bi_sectors = i->count;
- ret = bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- break;
- }
- }
-fsck_err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
-{
- u32 restart_count = trans->restart_count;
- return check_i_sectors_notnested(trans, w) ?:
- trans_was_restarted(trans, restart_count);
-}
-
-struct extent_end {
- u32 snapshot;
- u64 offset;
- struct snapshots_seen seen;
-};
-
-struct extent_ends {
- struct bpos last_pos;
- DARRAY(struct extent_end) e;
-};
-
-static void extent_ends_reset(struct extent_ends *extent_ends)
-{
- darray_for_each(extent_ends->e, i)
- snapshots_seen_exit(&i->seen);
- extent_ends->e.nr = 0;
-}
-
-static void extent_ends_exit(struct extent_ends *extent_ends)
-{
- extent_ends_reset(extent_ends);
- darray_exit(&extent_ends->e);
-}
-
-static void extent_ends_init(struct extent_ends *extent_ends)
-{
- memset(extent_ends, 0, sizeof(*extent_ends));
-}
-
-static int extent_ends_at(struct bch_fs *c,
- struct extent_ends *extent_ends,
- struct snapshots_seen *seen,
- struct bkey_s_c k)
-{
- struct extent_end *i, n = (struct extent_end) {
- .offset = k.k->p.offset,
- .snapshot = k.k->p.snapshot,
- .seen = *seen,
- };
-
- n.seen.ids.data = kmemdup(seen->ids.data,
- sizeof(seen->ids.data[0]) * seen->ids.size,
- GFP_KERNEL);
- if (!n.seen.ids.data)
- return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);
-
- __darray_for_each(extent_ends->e, i) {
- if (i->snapshot == k.k->p.snapshot) {
- snapshots_seen_exit(&i->seen);
- *i = n;
- return 0;
- }
-
- if (i->snapshot >= k.k->p.snapshot)
- break;
- }
-
- return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
-}
-
-static int overlapping_extents_found(struct btree_trans *trans,
- enum btree_id btree,
- struct bpos pos1, struct snapshots_seen *pos1_seen,
- struct bkey pos2,
- bool *fixed,
- struct extent_end *extent_end)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter iter1, iter2 = {};
- struct bkey_s_c k1, k2;
- int ret;
-
- BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
-
- bch2_trans_iter_init(trans, &iter1, btree, pos1,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_not_extents);
- k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
- ret = bkey_err(k1);
- if (ret)
- goto err;
-
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k1);
-
- if (!bpos_eq(pos1, k1.k->p)) {
- prt_str(&buf, "\nwanted\n ");
- bch2_bpos_to_text(&buf, pos1);
- prt_str(&buf, "\n");
- bch2_bkey_to_text(&buf, &pos2);
-
- bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
- __func__, buf.buf);
- ret = bch_err_throw(c, internal_fsck_err);
- goto err;
- }
-
- bch2_trans_copy_iter(trans, &iter2, &iter1);
-
- while (1) {
- bch2_btree_iter_advance(trans, &iter2);
-
- k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
- ret = bkey_err(k2);
- if (ret)
- goto err;
-
- if (bpos_ge(k2.k->p, pos2.p))
- break;
- }
-
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k2);
-
- if (bpos_gt(k2.k->p, pos2.p) ||
- pos2.size != k2.k->size) {
- bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
- __func__, buf.buf);
- ret = bch_err_throw(c, internal_fsck_err);
- goto err;
- }
-
- prt_printf(&buf, "\noverwriting %s extent",
- pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
-
- if (fsck_err(trans, extent_overlapping,
- "overlapping extents%s", buf.buf)) {
- struct btree_iter *old_iter = &iter1;
- struct disk_reservation res = { 0 };
-
- if (pos1.snapshot < pos2.p.snapshot) {
- old_iter = &iter2;
- swap(k1, k2);
- }
-
- trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
-
- ret = bch2_trans_update_extent_overwrite(trans, old_iter,
- BTREE_UPDATE_internal_snapshot_node,
- k1, k2) ?:
- bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
- bch2_disk_reservation_put(c, &res);
-
- bch_info(c, "repair ret %s", bch2_err_str(ret));
-
- if (ret)
- goto err;
-
- *fixed = true;
-
- if (pos1.snapshot == pos2.p.snapshot) {
- /*
- * We overwrote the first extent, and did the overwrite
- * in the same snapshot:
- */
- extent_end->offset = bkey_start_offset(&pos2);
- } else if (pos1.snapshot > pos2.p.snapshot) {
- /*
- * We overwrote the first extent in pos2's snapshot:
- */
- ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
- } else {
- /*
- * We overwrote the second extent - restart
- * check_extent() from the top:
- */
- ret = bch_err_throw(c, transaction_restart_nested);
- }
- }
-fsck_err:
-err:
- bch2_trans_iter_exit(trans, &iter2);
- bch2_trans_iter_exit(trans, &iter1);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int check_overlapping_extents(struct btree_trans *trans,
- struct snapshots_seen *seen,
- struct extent_ends *extent_ends,
- struct bkey_s_c k,
- struct btree_iter *iter,
- bool *fixed)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- /* transaction restart, running again */
- if (bpos_eq(extent_ends->last_pos, k.k->p))
- return 0;
-
- if (extent_ends->last_pos.inode != k.k->p.inode)
- extent_ends_reset(extent_ends);
-
- darray_for_each(extent_ends->e, i) {
- if (i->offset <= bkey_start_offset(k.k))
- continue;
-
- if (!ref_visible2(c,
- k.k->p.snapshot, seen,
- i->snapshot, &i->seen))
- continue;
-
- ret = overlapping_extents_found(trans, iter->btree_id,
- SPOS(iter->pos.inode,
- i->offset,
- i->snapshot),
- &i->seen,
- *k.k, fixed, i);
- if (ret)
- goto err;
- }
-
- extent_ends->last_pos = k.k->p;
-err:
- return ret;
-}
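Stripped of snapshots, the invariant check_overlapping_extents() relies on is simple: extents are visited in increasing key (end) order, and a new extent overlaps an earlier visible one iff some previously recorded end offset exceeds its start. A minimal single-snapshot sketch of that check (hypothetical names, illustrative only):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct extent { uint64_t start, end; };	/* half-open [start, end), start < end */

/*
 * @e must be sorted by end offset (the btree key order for extents);
 * returns true and sets *idx if some extent overlaps an earlier one.
 */
static bool find_overlap(const struct extent *e, size_t nr, size_t *idx)
{
	uint64_t prev_end = 0;

	for (size_t i = 0; i < nr; i++) {
		if (e[i].start < prev_end) {
			*idx = i;	/* e[i] overlaps an earlier extent */
			return true;
		}
		if (e[i].end > prev_end)
			prev_end = e[i].end;
	}

	return false;
}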
-
-static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
- unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
-
- bkey_for_each_crc(k.k, ptrs, crc, i)
- if (crc_is_encoded(crc) &&
- crc.uncompressed_size > encoded_extent_max_sectors) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
- printbuf_exit(&buf);
- }
-
- return 0;
-}
-
-static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct inode_walker *inode,
- struct snapshots_seen *s,
- struct extent_ends *extent_ends,
- struct disk_reservation *res)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- goto out;
- }
-
- if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
- ret = check_i_sectors(trans, inode);
- if (ret)
- goto err;
- }
-
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
- if (ret)
- goto err;
-
- struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
- ret = PTR_ERR_OR_ZERO(extent_i);
- if (ret)
- goto err;
-
- ret = check_key_has_inode(trans, iter, inode, extent_i, k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_whiteout) {
- ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
- &inode->recalculate_sums);
- if (ret)
- goto err;
-
- /*
- * Check inodes in reverse order, from oldest snapshots to
- * newest, starting from the inode that matches this extent's
- * snapshot. If we didn't have one, iterate over all inodes:
- */
- for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
- inode->inodes.data && i >= inode->inodes.data;
- --i) {
- if (i->inode.bi_snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
- continue;
-
- u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9;
-
- if (fsck_err_on(k.k->p.offset > last_block &&
- !bkey_extent_is_reservation(k),
- trans, extent_past_end_of_inode,
- "extent type past end of inode %llu:%u, i_size %llu\n%s",
- i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?:
- bch2_fpunch_snapshot(trans,
- SPOS(i->inode.bi_inum,
- last_block,
- i->inode.bi_snapshot),
- POS(i->inode.bi_inum, U64_MAX));
- if (ret)
- goto err;
-
- iter->k.type = KEY_TYPE_whiteout;
- break;
- }
- }
- }
-
- ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- if (bkey_extent_is_allocation(k.k)) {
- for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
- inode->inodes.data && i >= inode->inodes.data;
- --i) {
- if (i->whiteout ||
- i->inode.bi_snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
- continue;
-
- i->count += k.k->size;
- }
- }
-
- if (k.k->type != KEY_TYPE_whiteout) {
- ret = extent_ends_at(c, extent_ends, s, k);
- if (ret)
- goto err;
- }
-out:
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Walk extents: verify that extents have a corresponding S_ISREG inode, and
- * that i_size and i_sectors are consistent
- */
-int bch2_check_extents(struct bch_fs *c)
-{
- struct inode_walker w = inode_walker_init();
- struct snapshots_seen s;
- struct extent_ends extent_ends;
- struct disk_reservation res = { 0 };
-
- snapshots_seen_init(&s);
- extent_ends_init(&extent_ends);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_extents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
- check_extent_overbig(trans, &iter, k);
- })) ?:
- check_i_sectors_notnested(trans, &w));
-
- bch2_disk_reservation_put(c, &res);
- extent_ends_exit(&extent_ends);
- inode_walker_exit(&w);
- snapshots_seen_exit(&s);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_check_indirect_extents(struct bch_fs *c)
-{
- struct disk_reservation res = { 0 };
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
- POS_MIN,
- BTREE_ITER_prefetch, k,
- &res, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
- bch2_disk_reservation_put(c, &res);
- check_extent_overbig(trans, &iter, k);
- })));
-
- bch2_disk_reservation_put(c, &res);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
-{
- struct bch_fs *c = trans->c;
- int ret = 0;
- s64 count2;
-
- darray_for_each(w->inodes, i) {
- if (i->inode.bi_nlink == i->count)
- continue;
-
- count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
- if (count2 < 0)
- return count2;
-
- if (i->count != count2) {
- bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
- i->count = count2;
- if (i->inode.bi_nlink == i->count)
- continue;
- }
-
- if (i->inode.bi_nlink != i->count) {
- CLASS(printbuf, buf)();
-
- lockrestart_do(trans,
- bch2_inum_snapshot_to_path(trans, w->last_pos.inode,
- i->inode.bi_snapshot, NULL, &buf));
-
- if (fsck_err_on(i->inode.bi_nlink != i->count,
- trans, inode_dir_wrong_nlink,
- "directory with wrong i_nlink: got %u, should be %llu\n%s",
- i->inode.bi_nlink, i->count, buf.buf)) {
- i->inode.bi_nlink = i->count;
- ret = bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- break;
- }
- }
- }
-fsck_err:
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
-{
- u32 restart_count = trans->restart_count;
- return check_subdir_count_notnested(trans, w) ?:
- trans_was_restarted(trans, restart_count);
-}
-
-/* find a subvolume that's a descendant of @snapshot: */
-static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_subvolume)
- continue;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
- bch2_trans_iter_exit(trans, &iter);
- *subvolid = k.k->p.offset;
- goto found;
- }
- }
- if (!ret)
- ret = -ENOENT;
-found:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-noinline_for_stack
-static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c_dirent d)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter subvol_iter = {};
- struct bch_inode_unpacked subvol_root;
- u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
- u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
- u32 parent_snapshot;
- u32 new_parent_subvol = 0;
- u64 parent_inum;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (ret ||
- (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
- int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
-		if (ret2 && !bch2_err_matches(ret2, ENOENT))
- return ret2;
- }
-
- if (ret &&
- !new_parent_subvol &&
- (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
- /*
- * Couldn't find a subvol for dirent's snapshot - but we lost
- * subvols, so we need to reconstruct:
- */
- ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
- if (ret)
- return ret;
-
- parent_snapshot = d.k->p.snapshot;
- }
-
- if (fsck_err_on(ret,
- trans, dirent_to_missing_parent_subvol,
- "dirent parent_subvol points to missing subvolume\n%s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
- fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
- trans, dirent_not_visible_in_parent_subvol,
- "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
- parent_snapshot,
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- if (!new_parent_subvol) {
- bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
- return bch_err_throw(c, fsck_repair_unimplemented);
- }
-
- struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
- ret = PTR_ERR_OR_ZERO(new_dirent);
- if (ret)
- goto err;
-
- new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
- }
-
- struct bkey_s_c_subvolume s =
- bch2_bkey_get_iter_typed(trans, &subvol_iter,
- BTREE_ID_subvolumes, POS(0, target_subvol),
- 0, subvolume);
- ret = bkey_err(s.s_c);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (ret) {
- if (fsck_err(trans, dirent_to_missing_subvol,
- "dirent points to missing subvolume\n%s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
- return bch2_fsck_remove_dirent(trans, d.k->p);
- ret = 0;
- goto out;
- }
-
- if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) {
- printbuf_reset(&buf);
-
-		prt_printf(&buf, "subvol with wrong fs_path_parent, should be %u\n",
- parent_subvol);
-
- ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset,
- le64_to_cpu(s.v->inode) }, &buf);
- if (ret)
- goto err;
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, s.s_c);
-
- if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) {
- struct bkey_i_subvolume *n =
- bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- n->v.fs_path_parent = cpu_to_le32(parent_subvol);
- }
- }
-
- u64 target_inum = le64_to_cpu(s.v->inode);
- u32 target_snapshot = le32_to_cpu(s.v->snapshot);
-
- ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot,
- &subvol_root, 0);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (ret) {
- bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- goto err;
- }
-
- if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
- trans, inode_bi_parent_wrong,
- "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
- target_inum,
- subvol_root.bi_parent_subvol, parent_subvol)) {
- subvol_root.bi_parent_subvol = parent_subvol;
- subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
- ret = __bch2_fsck_write_inode(trans, &subvol_root);
- if (ret)
- goto err;
- }
-
- ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
- if (ret)
- goto err;
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &subvol_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_hash_info *hash_info,
- struct inode_walker *dir,
- struct inode_walker *target,
- struct snapshots_seen *s,
- bool *need_second_pass)
-{
- struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- goto out;
- }
-
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
- if (ret)
- goto err;
-
- if (k.k->type == KEY_TYPE_whiteout)
- goto out;
-
- if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
- ret = check_subdir_dirents_count(trans, dir);
- if (ret)
- goto err;
- }
-
- i = walk_inode(trans, dir, k);
- ret = PTR_ERR_OR_ZERO(i);
- if (ret < 0)
- goto err;
-
- ret = check_key_has_inode(trans, iter, dir, i, k);
- if (ret)
- goto err;
-
- if (!i || i->whiteout)
- goto out;
-
- if (dir->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &i->inode);
- dir->first_this_inode = false;
-
- hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;
-
- ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
- iter, k, need_second_pass);
- if (ret < 0)
- goto err;
- if (ret) {
- /* dirent has been deleted */
- ret = 0;
- goto out;
- }
-
- if (k.k->type != KEY_TYPE_dirent)
- goto out;
-
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- /* check casefold */
- if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
- trans, dirent_casefold_mismatch,
- "dirent casefold does not match dir casefold\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_parent_subvol)
- : 0,
- };
- u64 target = d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol)
- : le64_to_cpu(d.v->d_inum);
- struct qstr name = bch2_dirent_get_name(d);
-
- struct bkey_i_dirent *new_d =
- bch2_dirent_create_key(trans, hash_info, dir_inum,
- d.v->d_type, &name, NULL, target);
- ret = PTR_ERR_OR_ZERO(new_d);
- if (ret)
- goto out;
-
- new_d->k.p.inode = d.k->p.inode;
- new_d->k.p.snapshot = d.k->p.snapshot;
-
- struct btree_iter dup_iter = {};
- ret = bch2_hash_delete_at(trans,
- bch2_dirent_hash_desc, hash_info, iter,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_str_hash_repair_key(trans, s,
- &bch2_dirent_hash_desc, hash_info,
- iter, bkey_i_to_s_c(&new_d->k_i),
- &dup_iter, bkey_s_c_null,
- need_second_pass);
- goto out;
- }
-
- if (d.v->d_type == DT_SUBVOL) {
- ret = check_dirent_to_subvol(trans, iter, d);
- if (ret)
- goto err;
- } else {
- ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
- if (ret)
- goto err;
-
- if (fsck_err_on(!target->inodes.nr,
- trans, dirent_to_missing_inode,
- "dirent points to missing inode:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- ret = bch2_fsck_remove_dirent(trans, d.k->p);
- if (ret)
- goto err;
- }
-
- darray_for_each(target->inodes, i) {
- ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
- if (ret)
- goto err;
- }
-
- darray_for_each(target->deletes, i)
- if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
- trans, dirent_to_overwritten_inode,
- "dirent points to inode overwritten in snapshot %u:\n%s",
- *i,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf))) {
- struct btree_iter delete_iter;
- bch2_trans_iter_init(trans, &delete_iter,
- BTREE_ID_dirents,
- SPOS(k.k->p.inode, k.k->p.offset, *i),
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(trans, &delete_iter) ?:
- bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- hash_info,
- &delete_iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &delete_iter);
- if (ret)
- goto err;
-
- }
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
- if (d.v->d_type == DT_DIR)
- i->count++;
- i->i_size += bkey_bytes(d.k);
- }
-out:
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
- * validate d_type
- */
-int bch2_check_dirents(struct bch_fs *c)
-{
- struct inode_walker dir = inode_walker_init();
- struct inode_walker target = inode_walker_init();
- struct snapshots_seen s;
- struct bch_hash_info hash_info;
- bool need_second_pass = false, did_second_pass = false;
- int ret;
-
- snapshots_seen_init(&s);
-again:
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
- &need_second_pass)) ?:
- check_subdir_count_notnested(trans, &dir));
-
- if (!ret && need_second_pass && !did_second_pass) {
- bch_info(c, "check_dirents requires second pass");
- swap(did_second_pass, need_second_pass);
- goto again;
- }
-
- if (!ret && need_second_pass) {
- bch_err(c, "dirents not repairing");
- ret = -EINVAL;
- }
-
- snapshots_seen_exit(&s);
- inode_walker_exit(&dir);
- inode_walker_exit(&target);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_hash_info *hash_info,
- struct inode_walker *inode)
-{
- struct bch_fs *c = trans->c;
-
- int ret = bch2_check_key_has_snapshot(trans, iter, k);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- struct inode_walker_entry *i = walk_inode(trans, inode, k);
- ret = PTR_ERR_OR_ZERO(i);
- if (ret)
- return ret;
-
- ret = check_key_has_inode(trans, iter, inode, i, k);
- if (ret)
- return ret;
-
- if (!i || i->whiteout)
- return 0;
-
- if (inode->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &i->inode);
- inode->first_this_inode = false;
-
- bool need_second_pass = false;
- return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info,
- iter, k, &need_second_pass);
-}
-
-/*
- * Walk xattrs: verify that they all have a corresponding inode
- */
-int bch2_check_xattrs(struct bch_fs *c)
-{
- struct inode_walker inode = inode_walker_init();
- struct bch_hash_info hash_info;
- int ret = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- check_xattr(trans, &iter, k, &hash_info, &inode)));
-
- inode_walker_exit(&inode);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_root_trans(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct bch_inode_unpacked root_inode;
- u32 snapshot;
- u64 inum;
- int ret;
-
- ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
- "root subvol missing")) {
- struct bkey_i_subvolume *root_subvol =
- bch2_trans_kmalloc(trans, sizeof(*root_subvol));
- ret = PTR_ERR_OR_ZERO(root_subvol);
- if (ret)
- goto err;
-
- snapshot = U32_MAX;
- inum = BCACHEFS_ROOT_INO;
-
- bkey_subvolume_init(&root_subvol->k_i);
- root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_subvol->v.flags = 0;
- root_subvol->v.snapshot = cpu_to_le32(snapshot);
- root_subvol->v.inode = cpu_to_le64(inum);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
- bch_err_msg(c, ret, "writing root subvol");
- if (ret)
- goto err;
- }
-
- ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot,
- &root_inode, 0);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (mustfix_fsck_err_on(ret,
- trans, root_dir_missing,
- "root directory missing") ||
- mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
- trans, root_inode_not_dir,
- "root inode not a directory")) {
- bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
- 0, NULL);
- root_inode.bi_inum = inum;
- root_inode.bi_snapshot = snapshot;
-
- ret = __bch2_fsck_write_inode(trans, &root_inode);
- bch_err_msg(c, ret, "writing root inode");
- }
-err:
-fsck_err:
- return ret;
-}
-
-/* Get root directory, create if it doesn't exist: */
-int bch2_check_root(struct bch_fs *c)
-{
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_root_trans(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static bool darray_u32_has(darray_u32 *d, u32 v)
-{
- darray_for_each(*d, i)
- if (*i == v)
- return true;
- return false;
-}
-
-static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter parent_iter = {};
- darray_u32 subvol_path = {};
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- subvol_inum start = {
- .subvol = k.k->p.offset,
- .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode),
- };
-
- while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
- ret = darray_push(&subvol_path, k.k->p.offset);
- if (ret)
- goto err;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-
- struct bch_inode_unpacked subvol_root;
- ret = bch2_inode_find_by_inum_trans(trans,
- (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
- &subvol_root);
- if (ret)
- break;
-
- u32 parent = le32_to_cpu(s.v->fs_path_parent);
-
- if (darray_u32_has(&subvol_path, parent)) {
- printbuf_reset(&buf);
- prt_printf(&buf, "subvolume loop: ");
-
- ret = bch2_inum_to_path(trans, start, &buf);
- if (ret)
- goto err;
-
- if (fsck_err(trans, subvol_loop, "%s", buf.buf))
- ret = reattach_subvol(trans, s);
- break;
- }
-
- bch2_trans_iter_exit(trans, &parent_iter);
- bch2_trans_iter_init(trans, &parent_iter,
- BTREE_ID_subvolumes, POS(0, parent), 0);
- k = bch2_btree_iter_peek_slot(trans, &parent_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
- trans, subvol_unreachable,
- "unreachable subvolume %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, s.s_c),
- buf.buf))) {
- ret = reattach_subvol(trans, s);
- break;
- }
- }
-fsck_err:
-err:
- printbuf_exit(&buf);
- darray_exit(&subvol_path);
- bch2_trans_iter_exit(trans, &parent_iter);
- return ret;
-}
-
-int bch2_check_subvolume_structure(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_subvol_path(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_bi_depth_renumber_one(struct btree_trans *trans,
- u64 inum, u32 snapshot,
- u32 new_depth)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum, snapshot), 0);
-
- struct bch_inode_unpacked inode;
- int ret = bkey_err(k) ?:
- !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
- : bch2_inode_unpack(k, &inode);
- if (ret)
- goto err;
-
- if (inode.bi_depth != new_depth) {
- inode.bi_depth = new_depth;
- ret = __bch2_fsck_write_inode(trans, &inode) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path,
- u32 snapshot, u32 new_bi_depth)
-{
- u32 restart_count = trans->restart_count;
- int ret = 0;
-
- darray_for_each_reverse(*path, i) {
- ret = nested_lockrestart_do(trans,
- bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth));
- bch_err_fn(trans->c, ret);
- if (ret)
- break;
-
- new_bi_depth++;
- }
-
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter inode_iter = {};
- darray_u64 path = {};
- struct printbuf buf = PRINTBUF;
- u32 snapshot = inode_k.k->p.snapshot;
- bool redo_bi_depth = false;
- u32 min_bi_depth = U32_MAX;
- int ret = 0;
-
- struct bpos start = inode_k.k->p;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(inode_k, &inode);
- if (ret)
- return ret;
-
- /*
-	 * If we're running full fsck, check_dirents() will have already run,
- * and we shouldn't see any missing backpointers here - otherwise that's
- * handled separately, by check_unreachable_inodes
- */
- while (!inode.bi_subvol &&
- bch2_inode_has_backpointer(&inode)) {
- struct btree_iter dirent_iter;
- struct bkey_s_c_dirent d;
-
- d = dirent_get_by_pos(trans, &dirent_iter,
- SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot));
- ret = bkey_err(d.s_c);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto out;
-
- if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
- bch2_trans_iter_exit(trans, &dirent_iter);
-
- if (bch2_err_matches(ret, ENOENT)) {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, inode_k);
- bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
- bch2_err_str(ret), buf.buf);
- goto out;
- }
-
- bch2_trans_iter_exit(trans, &dirent_iter);
-
- ret = darray_push(&path, inode.bi_inum);
- if (ret)
- return ret;
-
- bch2_trans_iter_exit(trans, &inode_iter);
- inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
- SPOS(0, inode.bi_dir, snapshot), 0);
-
- struct bch_inode_unpacked parent_inode;
- ret = bkey_err(inode_k) ?:
- !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
- : bch2_inode_unpack(inode_k, &parent_inode);
- if (ret) {
- /* Should have been caught in dirents pass */
- bch_err_msg(c, ret, "error looking up parent directory");
- goto out;
- }
-
- min_bi_depth = parent_inode.bi_depth;
-
- if (parent_inode.bi_depth < inode.bi_depth &&
- min_bi_depth < U16_MAX)
- break;
-
- inode = parent_inode;
- redo_bi_depth = true;
-
- if (darray_find(path, inode.bi_inum)) {
- printbuf_reset(&buf);
- prt_printf(&buf, "directory structure loop in snapshot %u: ",
- snapshot);
-
- ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf);
- if (ret)
- goto out;
-
- if (c->opts.verbose) {
- prt_newline(&buf);
- darray_for_each(path, i)
- prt_printf(&buf, "%llu ", *i);
- }
-
- if (fsck_err(trans, dir_loop, "%s", buf.buf)) {
- ret = remove_backpointer(trans, &inode);
- bch_err_msg(c, ret, "removing dirent");
- if (ret)
- goto out;
-
- ret = reattach_inode(trans, &inode);
- bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
- }
-
- goto out;
- }
- }
-
- if (inode.bi_subvol)
- min_bi_depth = 0;
-
- if (redo_bi_depth)
- ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth);
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &inode_iter);
- darray_exit(&path);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Check for loops in the directory structure: all other connectivity issues
- * have been fixed by prior passes
- */
-int bch2_check_directory_structure(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_intent|
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- if (!S_ISDIR(bkey_inode_mode(k)))
- continue;
-
- if (bch2_inode_flags(k) & BCH_INODE_unlinked)
- continue;
-
- check_path_loop(trans, k);
- })));
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-struct nlink_table {
- size_t nr;
- size_t size;
-
- struct nlink {
- u64 inum;
- u32 snapshot;
- u32 count;
- } *d;
-};
-
-static int add_nlink(struct bch_fs *c, struct nlink_table *t,
- u64 inum, u32 snapshot)
-{
- if (t->nr == t->size) {
- size_t new_size = max_t(size_t, 128UL, t->size * 2);
- void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
-
- if (!d) {
- bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
- new_size);
- return bch_err_throw(c, ENOMEM_fsck_add_nlink);
- }
-
- if (t->d)
- memcpy(d, t->d, t->size * sizeof(t->d[0]));
- kvfree(t->d);
-
- t->d = d;
- t->size = new_size;
- }
-
- t->d[t->nr++] = (struct nlink) {
- .inum = inum,
- .snapshot = snapshot,
- };
-
- return 0;
-}
-
-static int nlink_cmp(const void *_l, const void *_r)
-{
- const struct nlink *l = _l;
- const struct nlink *r = _r;
-
- return cmp_int(l->inum, r->inum);
-}
-
-static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
- struct nlink_table *links,
- u64 range_start, u64 range_end, u64 inum, u32 snapshot)
-{
- struct nlink *link, key = {
- .inum = inum, .snapshot = U32_MAX,
- };
-
- if (inum < range_start || inum >= range_end)
- return;
-
- link = __inline_bsearch(&key, links->d, links->nr,
- sizeof(links->d[0]), nlink_cmp);
- if (!link)
- return;
-
- while (link > links->d && link[0].inum == link[-1].inum)
- --link;
-
- for (; link < links->d + links->nr && link->inum == inum; link++)
- if (ref_visible(c, s, snapshot, link->snapshot)) {
- link->count++;
- if (link->snapshot >= snapshot)
- break;
- }
-}
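/*
 * Editorial note (descriptive comment, not part of the original patch):
 * nlink_cmp() compares only the inum, so __inline_bsearch() may land on any
 * of the entries for a hard-linked inode.  inc_link() therefore rewinds to
 * the first entry with that inum, then walks forward over every snapshot
 * version of it, bumping the count for each one the dirent is visible in.
 */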
-
-noinline_for_stack
-static int check_nlinks_find_hardlinks(struct bch_fs *c,
- struct nlink_table *t,
- u64 start, u64 *end)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_inodes,
- POS(0, start),
- BTREE_ITER_intent|
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- if (!bkey_is_inode(k.k))
- continue;
-
- /* Should never fail, checked by bch2_inode_invalid: */
- struct bch_inode_unpacked u;
- _ret3 = bch2_inode_unpack(k, &u);
- if (_ret3)
- break;
-
- /*
- * Backpointer and directory structure checks are sufficient for
- * directories, since they can't have hardlinks:
- */
- if (S_ISDIR(u.bi_mode))
- continue;
-
- /*
- * Previous passes ensured that bi_nlink is nonzero if
-		 * the inode has multiple hardlinks:
- */
- if (!u.bi_nlink)
- continue;
-
- ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
- if (ret) {
- *end = k.k->p.offset;
- ret = 0;
- break;
- }
- 0;
- })));
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-noinline_for_stack
-static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
- u64 range_start, u64 range_end)
-{
- struct snapshots_seen s;
-
- snapshots_seen_init(&s);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
- BTREE_ITER_intent|
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k, ({
- ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
- if (ret)
- break;
-
- if (k.k->type == KEY_TYPE_dirent) {
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-
- if (d.v->d_type != DT_DIR &&
- d.v->d_type != DT_SUBVOL)
- inc_link(c, &s, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
- }
- 0;
- })));
-
- snapshots_seen_exit(&s);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k,
- struct nlink_table *links,
- size_t *idx, u64 range_end)
-{
- struct bch_inode_unpacked u;
- struct nlink *link = &links->d[*idx];
- int ret = 0;
-
- if (k.k->p.offset >= range_end)
- return 1;
-
- if (!bkey_is_inode(k.k))
- return 0;
-
- ret = bch2_inode_unpack(k, &u);
- if (ret)
- return ret;
-
- if (S_ISDIR(u.bi_mode))
- return 0;
-
- if (!u.bi_nlink)
- return 0;
-
- while ((cmp_int(link->inum, k.k->p.offset) ?:
- cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
- BUG_ON(*idx == links->nr);
- link = &links->d[++*idx];
- }
-
- if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
- trans, inode_wrong_nlink,
- "inode %llu type %s has wrong i_nlink (%u, should be %u)",
- u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
- bch2_inode_nlink_get(&u), link->count)) {
- bch2_inode_nlink_set(&u, link->count);
- ret = __bch2_fsck_write_inode(trans, &u);
- }
-fsck_err:
- return ret;
-}
-
-noinline_for_stack
-static int check_nlinks_update_hardlinks(struct bch_fs *c,
- struct nlink_table *links,
- u64 range_start, u64 range_end)
-{
- size_t idx = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS(0, range_start),
- BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
- if (ret < 0) {
- bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
- return ret;
- }
-
- return 0;
-}
-
-int bch2_check_nlinks(struct bch_fs *c)
-{
- struct nlink_table links = { 0 };
- u64 this_iter_range_start, next_iter_range_start = 0;
- int ret = 0;
-
- do {
- this_iter_range_start = next_iter_range_start;
- next_iter_range_start = U64_MAX;
-
-		ret = check_nlinks_find_hardlinks(c, &links,
-						  this_iter_range_start,
-						  &next_iter_range_start);
-		if (ret)
-			break;
-
- ret = check_nlinks_walk_dirents(c, &links,
- this_iter_range_start,
- next_iter_range_start);
- if (ret)
- break;
-
- ret = check_nlinks_update_hardlinks(c, &links,
- this_iter_range_start,
- next_iter_range_start);
- if (ret)
- break;
-
- links.nr = 0;
- } while (next_iter_range_start != U64_MAX);
-
- kvfree(links.d);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_p p;
- struct bkey_i_reflink_p *u;
-
- if (k.k->type != KEY_TYPE_reflink_p)
- return 0;
-
- p = bkey_s_c_to_reflink_p(k);
-
- if (!p.v->front_pad && !p.v->back_pad)
- return 0;
-
- u = bch2_trans_kmalloc(trans, sizeof(*u));
- int ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- bkey_reassemble(&u->k_i, k);
- u->v.front_pad = 0;
- u->v.back_pad = 0;
-
- return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
-}
-
-int bch2_fix_reflink_p(struct bch_fs *c)
-{
- if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
- return 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_extents, POS_MIN,
- BTREE_ITER_intent|BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- fix_reflink_p_key(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-#ifndef NO_BCACHEFS_CHARDEV
-
-struct fsck_thread {
- struct thread_with_stdio thr;
- struct bch_fs *c;
- struct bch_opts opts;
-};
-
-static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
-{
- struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
- kfree(thr);
-}
-
-static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
-{
- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = thr->c;
-
- int ret = PTR_ERR_OR_ZERO(c);
- if (ret)
- return ret;
-
- ret = bch2_fs_start(thr->c);
- if (ret)
- goto err;
-
- if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
- ret |= 1;
- }
- if (test_bit(BCH_FS_error, &c->flags)) {
- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
- ret |= 4;
- }
-err:
- bch2_fs_stop(c);
- return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
- .exit = bch2_fsck_thread_exit,
- .fn = bch2_fsck_offline_thread_fn,
-};
-
-long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
-{
- struct bch_ioctl_fsck_offline arg;
- struct fsck_thread *thr = NULL;
- darray_const_str devs = {};
- long ret = 0;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- for (size_t i = 0; i < arg.nr_devs; i++) {
- u64 dev_u64;
- ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
- if (ret)
- goto err;
-
- char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
- ret = PTR_ERR_OR_ZERO(dev_str);
- if (ret)
- goto err;
-
- ret = darray_push(&devs, dev_str);
- if (ret) {
- kfree(dev_str);
- goto err;
- }
- }
-
- thr = kzalloc(sizeof(*thr), GFP_KERNEL);
- if (!thr) {
- ret = -ENOMEM;
- goto err;
- }
-
- thr->opts = bch2_opts_empty();
-
- if (arg.opts) {
- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
- ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
- if (!IS_ERR(optstr))
- kfree(optstr);
-
- if (ret)
- goto err;
- }
-
- opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
- opt_set(thr->opts, read_only, 1);
- opt_set(thr->opts, ratelimit_errors, 0);
-
- /* We need request_key() to be called before we punt to kthread: */
- opt_set(thr->opts, nostart, true);
-
- bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
-
- thr->c = bch2_fs_open(&devs, &thr->opts);
-
- if (!IS_ERR(thr->c) &&
- thr->c->opts.errors == BCH_ON_ERROR_panic)
- thr->c->opts.errors = BCH_ON_ERROR_ro;
-
- ret = __bch2_run_thread_with_stdio(&thr->thr);
-out:
- darray_for_each(devs, i)
- kfree(*i);
- darray_exit(&devs);
- return ret;
-err:
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- pr_err("ret %s", bch2_err_str(ret));
- goto out;
-}
-
-static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
-{
- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = thr->c;
-
- c->stdio_filter = current;
- c->stdio = &thr->thr.stdio;
-
- /*
- * XXX: can we figure out a way to do this without mucking with c->opts?
- */
- unsigned old_fix_errors = c->opts.fix_errors;
- if (opt_defined(thr->opts, fix_errors))
- c->opts.fix_errors = thr->opts.fix_errors;
- else
- c->opts.fix_errors = FSCK_FIX_ask;
-
- c->opts.fsck = true;
- set_bit(BCH_FS_in_fsck, &c->flags);
-
- int ret = bch2_run_online_recovery_passes(c, ~0ULL);
-
- clear_bit(BCH_FS_in_fsck, &c->flags);
- bch_err_fn(c, ret);
-
- c->stdio = NULL;
- c->stdio_filter = NULL;
- c->opts.fix_errors = old_fix_errors;
-
- up(&c->recovery.run_lock);
- bch2_ro_ref_put(c);
- return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
- .exit = bch2_fsck_thread_exit,
- .fn = bch2_fsck_online_thread_fn,
-};
-
-long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
-{
- struct fsck_thread *thr = NULL;
- long ret = 0;
-
- if (arg.flags)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!bch2_ro_ref_tryget(c))
- return -EROFS;
-
- if (down_trylock(&c->recovery.run_lock)) {
- bch2_ro_ref_put(c);
- return -EAGAIN;
- }
-
- thr = kzalloc(sizeof(*thr), GFP_KERNEL);
- if (!thr) {
- ret = -ENOMEM;
- goto err;
- }
-
- thr->c = c;
- thr->opts = bch2_opts_empty();
-
- if (arg.opts) {
- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
- ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
- if (!IS_ERR(optstr))
- kfree(optstr);
-
- if (ret)
- goto err;
- }
-
- ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
-err:
- if (ret < 0) {
- bch_err_fn(c, ret);
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- up(&c->recovery.run_lock);
- bch2_ro_ref_put(c);
- }
- return ret;
-}
-
-#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
deleted file mode 100644
index e5fe7cf7b251..000000000000
--- a/fs/bcachefs/fsck.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FSCK_H
-#define _BCACHEFS_FSCK_H
-
-#include "str_hash.h"
-
-/* records snapshot IDs of overwrites at @pos */
-struct snapshots_seen {
- struct bpos pos;
- snapshot_id_list ids;
-};
-
-int bch2_fsck_update_backpointers(struct btree_trans *,
- struct snapshots_seen *,
- const struct bch_hash_desc,
- struct bch_hash_info *,
- struct bkey_i *);
-
-int bch2_check_inodes(struct bch_fs *);
-int bch2_check_extents(struct bch_fs *);
-int bch2_check_indirect_extents(struct bch_fs *);
-int bch2_check_dirents(struct bch_fs *);
-int bch2_check_xattrs(struct bch_fs *);
-int bch2_check_root(struct bch_fs *);
-int bch2_check_subvolume_structure(struct bch_fs *);
-int bch2_check_unreachable_inodes(struct bch_fs *);
-int bch2_check_directory_structure(struct bch_fs *);
-int bch2_check_nlinks(struct bch_fs *);
-int bch2_fix_reflink_p(struct bch_fs *);
-
-long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *);
-long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online);
-
-#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
deleted file mode 100644
index ef4cc7395b86..000000000000
--- a/fs/bcachefs/inode.c
+++ /dev/null
@@ -1,1566 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_write_buffer.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "compress.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "fs.h"
-#include "inode.h"
-#include "namei.h"
-#include "opts.h"
-#include "str_hash.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "varint.h"
-
-#include <linux/random.h>
-
-#include <linux/unaligned.h>
-
-#define x(name, ...) #name,
-const char * const bch2_inode_opts[] = {
- BCH_INODE_OPTS()
- NULL,
-};
-
-static const char * const bch2_inode_flag_strs[] = {
- BCH_INODE_FLAGS()
- NULL
-};
-#undef x
-
-static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
-static int may_delete_deleted_inum(struct btree_trans *, subvol_inum);
-
-static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
-
-static int inode_decode_field(const u8 *in, const u8 *end,
- u64 out[2], unsigned *out_bits)
-{
- __be64 be[2] = { 0, 0 };
- unsigned bytes, shift;
- u8 *p;
-
- if (in >= end)
- return -BCH_ERR_inode_unpack_error;
-
- if (!*in)
- return -BCH_ERR_inode_unpack_error;
-
- /*
- * position of highest set bit indicates number of bytes:
- * shift = number of bits to remove in high byte:
- */
- shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
- bytes = byte_table[shift - 1];
-
- if (in + bytes > end)
- return -BCH_ERR_inode_unpack_error;
-
- p = (u8 *) be + 16 - bytes;
- memcpy(p, in, bytes);
- *p ^= (1 << 8) >> shift;
-
- out[0] = be64_to_cpu(be[0]);
- out[1] = be64_to_cpu(be[1]);
- *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
-
- return bytes;
-}
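/*
 * Editorial note (illustrative example, not part of the original patch):
 * in this legacy encoding, the position of the first byte's highest set bit
 * encodes the field width.  For example, a first byte of 0b001xxxxx has its
 * top bit at position 5, so shift = 8 - 5 = 3 and byte_table[2] = 3 bytes
 * are consumed; the marker bit is then cleared by *p ^= (1 << 8) >> shift
 * (0x20 here) before the big-endian load into be[].
 */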
-
-static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- struct bkey_i_inode_v3 *k = &packed->inode;
- u8 *out = k->v.fields;
- u8 *end = (void *) &packed[1];
- u8 *last_nonzero_field = out;
- unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
- unsigned bytes;
- int ret;
-
- bkey_inode_v3_init(&packed->inode.k_i);
- packed->inode.k.p.offset = inode->bi_inum;
- packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
- packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
- packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
- packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors);
- packed->inode.v.bi_size = cpu_to_le64(inode->bi_size);
- packed->inode.v.bi_version = cpu_to_le64(inode->bi_version);
- SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
- SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
-
-#define x(_name, _bits) \
- nr_fields++; \
- \
- if (inode->_name) { \
- ret = bch2_varint_encode_fast(out, inode->_name); \
- out += ret; \
- \
- if (_bits > 64) \
- *out++ = 0; \
- \
- last_nonzero_field = out; \
- last_nonzero_fieldnr = nr_fields; \
- } else { \
- *out++ = 0; \
- \
- if (_bits > 64) \
- *out++ = 0; \
- }
-
- BCH_INODE_FIELDS_v3()
-#undef x
- BUG_ON(out > end);
-
- out = last_nonzero_field;
- nr_fields = last_nonzero_fieldnr;
-
- bytes = out - (u8 *) &packed->inode.v;
- set_bkey_val_bytes(&packed->inode.k, bytes);
- memset_u64s_tail(&packed->inode.v, 0, bytes);
-
- SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- struct bch_inode_unpacked unpacked;
-
- ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
- BUG_ON(ret);
- BUG_ON(unpacked.bi_inum != inode->bi_inum);
- BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
- BUG_ON(unpacked.bi_sectors != inode->bi_sectors);
- BUG_ON(unpacked.bi_size != inode->bi_size);
- BUG_ON(unpacked.bi_version != inode->bi_version);
- BUG_ON(unpacked.bi_mode != inode->bi_mode);
-
-#define x(_name, _bits) if (unpacked._name != inode->_name) \
- panic("unpacked %llu should be %llu", \
- (u64) unpacked._name, (u64) inode->_name);
- BCH_INODE_FIELDS_v3()
-#undef x
- }
-}
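/*
 * Editorial note (descriptive comment, not part of the original patch):
 * the packed v3 format above writes each BCH_INODE_FIELDS_v3() field as a
 * varint (fields declared wider than 64 bits get a second, zero varint),
 * rewinds 'out' to drop trailing all-zero fields, and records how many
 * fields were written in INODEv3_NR_FIELDS so the unpack side knows where
 * to stop.  With CONFIG_BCACHEFS_DEBUG the result is immediately
 * round-tripped through bch2_inode_unpack() as a self-check.
 */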
-
-void bch2_inode_pack(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- bch2_inode_pack_inlined(packed, inode);
-}
-
-static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
- struct bch_inode_unpacked *unpacked)
-{
- const u8 *in = inode.v->fields;
- const u8 *end = bkey_val_end(inode);
- u64 field[2];
- unsigned fieldnr = 0, field_bits;
- int ret;
-
-#define x(_name, _bits) \
- if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \
- unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
- memset((void *) unpacked + offset, 0, \
- sizeof(*unpacked) - offset); \
- return 0; \
- } \
- \
- ret = inode_decode_field(in, end, field, &field_bits); \
- if (ret < 0) \
- return ret; \
- \
- if (field_bits > sizeof(unpacked->_name) * 8) \
- return -BCH_ERR_inode_unpack_error; \
- \
- unpacked->_name = field[1]; \
- in += ret;
-
- BCH_INODE_FIELDS_v2()
-#undef x
-
- /* XXX: signal if there were more fields than expected? */
- return 0;
-}
-
-static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
- const u8 *in, const u8 *end,
- unsigned nr_fields)
-{
- unsigned fieldnr = 0;
- int ret;
- u64 v[2];
-
-#define x(_name, _bits) \
- if (fieldnr < nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v[0]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- \
- if (_bits > 64) { \
- ret = bch2_varint_decode_fast(in, end, &v[1]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v[1] = 0; \
- } \
- } else { \
- v[0] = v[1] = 0; \
- } \
- \
- unpacked->_name = v[0]; \
- if (v[1] || v[0] != unpacked->_name) \
- return -BCH_ERR_inode_unpack_error; \
- fieldnr++;
-
- BCH_INODE_FIELDS_v2()
-#undef x
-
- /* XXX: signal if there were more fields than expected? */
- return 0;
-}
-
-static int bch2_inode_unpack_v3(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
-{
- struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
- const u8 *in = inode.v->fields;
- const u8 *end = bkey_val_end(inode);
- unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
- unsigned fieldnr = 0;
- int ret;
- u64 v[2];
-
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_snapshot = inode.k->p.snapshot;
- unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
- unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors);
- unpacked->bi_size = le64_to_cpu(inode.v->bi_size);
- unpacked->bi_version = le64_to_cpu(inode.v->bi_version);
- unpacked->bi_mode = INODEv3_MODE(inode.v);
-
-#define x(_name, _bits) \
- if (fieldnr < nr_fields) { \
- ret = bch2_varint_decode_fast(in, end, &v[0]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- \
- if (_bits > 64) { \
- ret = bch2_varint_decode_fast(in, end, &v[1]); \
- if (ret < 0) \
- return ret; \
- in += ret; \
- } else { \
- v[1] = 0; \
- } \
- } else { \
- v[0] = v[1] = 0; \
- } \
- \
- unpacked->_name = v[0]; \
- if (v[1] || v[0] != unpacked->_name) \
- return -BCH_ERR_inode_unpack_error; \
- fieldnr++;
-
- BCH_INODE_FIELDS_v3()
-#undef x
-
- /* XXX: signal if there were more fields than expected? */
- return 0;
-}
-
-static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
-{
- memset(unpacked, 0, sizeof(*unpacked));
-
- switch (k.k->type) {
- case KEY_TYPE_inode: {
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_snapshot = inode.k->p.snapshot;
- unpacked->bi_journal_seq= 0;
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
-
- if (INODEv1_NEW_VARINT(inode.v)) {
- return bch2_inode_unpack_v2(unpacked, inode.v->fields,
- bkey_val_end(inode),
- INODEv1_NR_FIELDS(inode.v));
- } else {
- return bch2_inode_unpack_v1(inode, unpacked);
- }
- break;
- }
- case KEY_TYPE_inode_v2: {
- struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
-
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_snapshot = inode.k->p.snapshot;
- unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
-
- return bch2_inode_unpack_v2(unpacked, inode.v->fields,
- bkey_val_end(inode),
- INODEv2_NR_FIELDS(inode.v));
- }
- default:
- BUG();
- }
-}
-
-int bch2_inode_unpack(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
-{
- return likely(k.k->type == KEY_TYPE_inode_v3)
- ? bch2_inode_unpack_v3(k, unpacked)
- : bch2_inode_unpack_slowpath(k, unpacked);
-}
-
-int __bch2_inode_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags,
- bool warn)
-{
- u32 snapshot;
- int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn);
- if (ret)
- return ret;
-
- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot),
- flags|BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
- if (ret)
- goto err;
-
- ret = bch2_inode_unpack(k, inode);
- if (ret)
- goto err;
-
- return 0;
-err:
- if (warn)
- bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans,
- u64 inode_nr, u32 snapshot,
- struct bch_inode_unpacked *inode,
- unsigned flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inode_nr, snapshot), flags);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- ret = bkey_is_inode(k.k)
- ? bch2_inode_unpack(k, inode)
- : -BCH_ERR_ENOENT_inode;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- int ret;
-
- ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- int ret;
-
- ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
-}
-
-int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
- struct bch_inode_unpacked *root)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum, U32_MAX),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
- if (bkey_is_inode(k.k)) {
- ret = bch2_inode_unpack(k, root);
- goto out;
- }
- }
- /* We're only called when we know we have an inode for @inum */
- BUG_ON(!ret);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_write_flags(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_inode_buf *inode_p;
-
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack_inlined(inode_p, inode);
- inode_p->inode.k.p.snapshot = iter->snapshot;
- return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
-}
-
-int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
- struct bkey_inode_buf *inode_p =
- bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack(inode_p, inode);
- inode_p->inode.k.p.snapshot = inode->bi_snapshot;
-
- return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
- &inode_p->inode.k_i,
- BTREE_UPDATE_internal_snapshot_node);
-}
-
-int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
-{
- int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_fsck_write_inode(trans, inode));
- bch_err_fn(trans->c, ret);
- return ret;
-}
-
-struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
-{
- struct bch_inode_unpacked u;
- struct bkey_inode_buf *inode_p;
- int ret;
-
- if (!bkey_is_inode(&k->k))
- return ERR_PTR(-ENOENT);
-
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return ERR_CAST(inode_p);
-
- ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
- if (ret)
- return ERR_PTR(ret);
-
- bch2_inode_pack(inode_p, &u);
- return &inode_p->inode.k_i;
-}
-
-static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bch_inode_unpacked unpacked;
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode,
- c, inode_pos_inode_nonzero,
- "nonzero k.p.inode");
-
- bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX,
- c, inode_pos_blockdev_range,
- "fs inode in blockdev range");
-
- bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked),
- c, inode_unpack_error,
- "invalid variable length fields");
-
- bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1,
- c, inode_checksum_type_invalid,
-			 "invalid data checksum type (%u >= %u)",
- unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
-
- bkey_fsck_err_on(unpacked.bi_compression &&
- !bch2_compression_opt_valid(unpacked.bi_compression - 1),
- c, inode_compression_type_invalid,
- "invalid compression opt %u", unpacked.bi_compression - 1);
-
- bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
- unpacked.bi_nlink != 0,
- c, inode_unlinked_but_nlink_nonzero,
- "flagged as unlinked but bi_nlink != 0");
-
- bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode),
- c, inode_subvol_root_but_not_dir,
- "subvolume root but not a directory");
-fsck_err:
- return ret;
-}
-
-int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- int ret = 0;
-
- bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
- c, inode_str_hash_invalid,
- "invalid str hash type (%llu >= %u)",
- INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
- ret = __bch2_inode_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
- int ret = 0;
-
- bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
- c, inode_str_hash_invalid,
- "invalid str hash type (%llu >= %u)",
- INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
- ret = __bch2_inode_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
- int ret = 0;
-
- bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
- INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k),
- c, inode_v3_fields_start_bad,
- "invalid fields_start (got %llu, min %u max %zu)",
- INODEv3_FIELDS_START(inode.v),
- INODEv3_FIELDS_START_INITIAL,
- bkey_val_u64s(inode.k));
-
- bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
- c, inode_str_hash_invalid,
- "invalid str hash type (%llu >= %u)",
- INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
-
- ret = __bch2_inode_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-static void __bch2_inode_unpacked_to_text(struct printbuf *out,
- struct bch_inode_unpacked *inode)
-{
- prt_printf(out, "\n");
- printbuf_indent_add(out, 2);
- prt_printf(out, "mode=%o\n", inode->bi_mode);
-
- prt_str(out, "flags=");
- prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
- prt_printf(out, "(%x)\n", inode->bi_flags);
-
- prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
- prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed);
- prt_printf(out, "hash_type=");
- bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
- prt_newline(out);
- prt_printf(out, "bi_size=%llu\n", inode->bi_size);
- prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
- prt_printf(out, "bi_version=%llu\n", inode->bi_version);
-
-#define x(_name, _bits) \
- prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
- BCH_INODE_FIELDS_v3()
-#undef x
-
- bch2_printbuf_strip_trailing_newline(out);
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
-{
- prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
- __bch2_inode_unpacked_to_text(out, inode);
-}
-
-void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bch_inode_unpacked inode;
-
- if (bch2_inode_unpack(k, &inode)) {
- prt_printf(out, "(unpack error)");
- return;
- }
-
- __bch2_inode_unpacked_to_text(out, &inode);
-}
-
-static inline u64 bkey_inode_flags(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
- case KEY_TYPE_inode_v2:
- return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
- case KEY_TYPE_inode_v3:
- return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
- default:
- return 0;
- }
-}
-
-static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
- return;
- case KEY_TYPE_inode_v2:
- bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
- return;
- case KEY_TYPE_inode_v3:
- bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
- return;
- default:
- BUG();
- }
-}
-
-static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
-{
-	unsigned f = bkey_inode_flags(k);
-
- return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
-}
-
-static struct bkey_s_c
-bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
- enum btree_id btree, struct bpos pos,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_max_norestart(trans, *iter, btree,
- bpos_successor(pos),
- SPOS(pos.inode, pos.offset, U32_MAX),
- flags|BTREE_ITER_all_snapshots, k, ret)
- if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-}
-
-static struct bkey_s_c
-bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos pos, unsigned flags)
-{
- struct bkey_s_c k;
-again:
- k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
- if (!k.k ||
- bkey_err(k) ||
- bkey_is_inode(k.k))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
- pos = k.k->p;
- goto again;
-}
-
-int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_max_norestart(trans, iter,
- BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
- BTREE_ITER_all_snapshots|
- BTREE_ITER_with_updates, k, ret)
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
- bkey_is_inode(k.k)) {
- ret = 1;
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int update_inode_has_children(struct btree_trans *trans,
- struct bkey_s k,
- bool have_child)
-{
- if (!have_child) {
- int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret)
- return ret < 0 ? ret : 0;
- }
-
- u64 f = bkey_inode_flags(k.s_c);
- if (have_child != !!(f & BCH_INODE_has_child_snapshot))
- bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
-
- return 0;
-}
-
-static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
- bool have_child)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
- &iter, pos, BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k)
- return 0;
-
- if (!have_child) {
- ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- goto err;
- }
- }
-
- u64 f = bkey_inode_flags(k);
- if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
- struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
- BTREE_UPDATE_internal_snapshot_node);
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- goto err;
-
- bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_trigger_inode(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
-
- if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
- BUG_ON(!trans->journal_res.seq);
- bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
- }
-
- s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) };
- if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) {
- int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes);
- if (ret)
- return ret;
- }
-
- if (flags & BTREE_TRIGGER_transactional) {
- int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) -
- (int) bkey_is_unlinked_inode(old);
- if (unlinked_delta) {
- int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
- new.k->p, unlinked_delta > 0);
- if (ret)
- return ret;
- }
-
- /*
- * If we're creating or deleting an inode at this snapshot ID,
- * and there might be an inode in a parent snapshot ID, we might
- * need to set or clear the has_child_snapshot flag on the
- * parent.
- */
- int deleted_delta = (int) bkey_is_inode(new.k) -
- (int) bkey_is_inode(old.k);
- if (deleted_delta &&
- bch2_snapshot_parent(c, new.k->p.snapshot)) {
- int ret = update_parent_inode_has_children(trans, new.k->p,
- deleted_delta > 0);
- if (ret)
- return ret;
- }
-
- /*
- * When an inode is first updated in a new snapshot, we may need
- * to clear has_child_snapshot
- */
- if (deleted_delta > 0) {
- int ret = update_inode_has_children(trans, new, false);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode,
- c, inode_pos_inode_nonzero,
- "nonzero k.p.inode");
-fsck_err:
- return ret;
-}
-
-void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
-
- prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
-}
-
-int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors,
- c, inode_alloc_cursor_inode_bad,
- "k.p.inode bad");
-fsck_err:
- return ret;
-}
-
-void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k);
-
- prt_printf(out, "idx %llu generation %llu",
- le64_to_cpu(i.v->idx),
- le64_to_cpu(i.v->gen));
-}
-
-void bch2_inode_init_early(struct bch_fs *c,
- struct bch_inode_unpacked *inode_u)
-{
- enum bch_str_hash_type str_hash =
- bch2_str_hash_opt_to_type(c, c->opts.str_hash);
-
- memset(inode_u, 0, sizeof(*inode_u));
-
- SET_INODE_STR_HASH(inode_u, str_hash);
- get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
-}
-
-void bch2_inode_init_late(struct bch_fs *c,
- struct bch_inode_unpacked *inode_u, u64 now,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct bch_inode_unpacked *parent)
-{
- inode_u->bi_mode = mode;
- inode_u->bi_uid = uid;
- inode_u->bi_gid = gid;
- inode_u->bi_dev = rdev;
- inode_u->bi_atime = now;
- inode_u->bi_mtime = now;
- inode_u->bi_ctime = now;
- inode_u->bi_otime = now;
-
- if (parent && parent->bi_mode & S_ISGID) {
- inode_u->bi_gid = parent->bi_gid;
- if (S_ISDIR(mode))
- inode_u->bi_mode |= S_ISGID;
- }
-
- if (parent) {
-#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name;
- BCH_INODE_OPTS()
-#undef x
- }
-
- if (!S_ISDIR(mode))
- inode_u->bi_casefold = 0;
-
- if (bch2_inode_casefold(c, inode_u))
- inode_u->bi_flags |= BCH_INODE_has_case_insensitive;
-}
-
-void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct bch_inode_unpacked *parent)
-{
- bch2_inode_init_early(c, inode_u);
- bch2_inode_init_late(c, inode_u, bch2_current_time(c),
- uid, gid, mode, rdev, parent);
-}
-
-static struct bkey_i_inode_alloc_cursor *
-bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
-{
- struct bch_fs *c = trans->c;
-
- u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
-
- cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
- BTREE_ID_logged_ops,
- POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx),
- BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (ret)
- return ERR_PTR(ret);
-
- struct bkey_i_inode_alloc_cursor *cursor =
- k.k->type == KEY_TYPE_inode_alloc_cursor
- ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor)
- : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor);
- ret = PTR_ERR_OR_ZERO(cursor);
- if (ret)
- goto err;
-
- if (c->opts.inodes_32bit) {
- *min = BLOCKDEV_INODE_MAX;
- *max = INT_MAX;
- } else {
- cursor->v.bits = c->opts.shard_inode_numbers_bits;
-
- unsigned bits = 63 - c->opts.shard_inode_numbers_bits;
-
- *min = max(cpu << bits, (u64) INT_MAX + 1);
- *max = (cpu << bits) | ~(ULLONG_MAX << bits);
- }
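- /*
- * e.g. with shard_inode_numbers_bits == 4, bits == 59: shard 3 hands
- * out inode numbers in [3 << 59, (3 << 59) | ((1ULL << 59) - 1)], and
- * shard 0 starts at INT_MAX + 1 to stay clear of 32-bit inode numbers.
- */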
-
- if (le64_to_cpu(cursor->v.idx) < *min)
- cursor->v.idx = cpu_to_le64(*min);
-
- if (le64_to_cpu(cursor->v.idx) >= *max) {
- cursor->v.idx = cpu_to_le64(*min);
- le32_add_cpu(&cursor->v.gen, 1);
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret ? ERR_PTR(ret) : cursor;
-}
-
-/*
- * This just finds an empty slot in the inodes btree, starting from the
- * allocation cursor and wrapping around to the start of the range if
- * necessary:
- */
-int bch2_inode_create(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode_u,
- u32 snapshot, u64 cpu)
-{
- u64 min, max;
- struct bkey_i_inode_alloc_cursor *cursor =
- bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
- int ret = PTR_ERR_OR_ZERO(cursor);
- if (ret)
- return ret;
-
- u64 start = le64_to_cpu(cursor->v.idx);
- u64 pos = start;
-
- bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
- BTREE_ITER_all_snapshots|
- BTREE_ITER_intent);
- struct bkey_s_c k;
-again:
- while ((k = bch2_btree_iter_peek(trans, iter)).k &&
- !(ret = bkey_err(k)) &&
- bkey_lt(k.k->p, POS(0, max))) {
- if (pos < iter->pos.offset)
- goto found_slot;
-
- /*
- * We don't need to iterate over keys in every snapshot once
- * we've found just one:
- */
- pos = iter->pos.offset + 1;
- bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
- }
-
- if (!ret && pos < max)
- goto found_slot;
-
- if (!ret && start == min)
- ret = bch_err_throw(trans->c, ENOSPC_inode_create);
-
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ret;
- }
-
- /* Retry from start */
- pos = start = min;
- bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
- le32_add_cpu(&cursor->v.gen, 1);
- goto again;
-found_slot:
- bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot));
- k = bch2_btree_iter_peek_slot(trans, iter);
- ret = bkey_err(k);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ret;
- }
-
- inode_u->bi_inum = k.k->p.offset;
- inode_u->bi_generation = le64_to_cpu(cursor->v.gen);
- cursor->v.idx = cpu_to_le64(k.k->p.offset + 1);
- return 0;
-}
-
-static int bch2_inode_delete_keys(struct btree_trans *trans,
- subvol_inum inum, enum btree_id id)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i delete;
- struct bpos end = POS(inum.inum, U64_MAX);
- u32 snapshot;
- int ret = 0;
-
- /*
- * We're never going to be deleting partial extents, no need to use an
- * extent iterator:
- */
- bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
- BTREE_ITER_intent);
-
- while (1) {
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
- k = bch2_btree_iter_peek_max(trans, &iter, end);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k)
- break;
-
- bkey_init(&delete.k);
- delete.k.p = iter.pos;
-
- if (iter.flags & BTREE_ITER_is_extents)
- bch2_key_resize(&delete.k,
- bpos_min(end, k.k->p).offset -
- iter.pos.offset);
-
- ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-err:
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = {};
- struct bkey_s_c k;
- u32 snapshot;
- int ret;
-
- ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum));
- if (ret)
- goto err2;
-
- /*
- * If this was a directory, there shouldn't be any real dirents left -
- * but there could be whiteouts (from hash collisions) that we should
- * delete:
- *
- * XXX: the dirent code ideally would delete whiteouts when they're no
- * longer needed
- */
- ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
- bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
- bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
- if (ret)
- goto err2;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot),
- BTREE_ITER_intent|BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!bkey_is_inode(k.k)) {
- bch2_fs_inconsistent(c,
- "inode %llu:%u not found when deleting",
- inum.inum, snapshot);
- ret = bch_err_throw(c, ENOENT_inode);
- goto err;
- }
-
- ret = bch2_btree_delete_at(trans, &iter, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- if (ret)
- goto err2;
-
- ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
-err2:
- bch2_trans_put(trans);
- return ret;
-}
-
-int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
-{
- if (bi->bi_flags & BCH_INODE_unlinked)
- bi->bi_flags &= ~BCH_INODE_unlinked;
- else {
- if (bi->bi_nlink == U32_MAX)
- return -EINVAL;
-
- bi->bi_nlink++;
- }
-
- return 0;
-}
-
-void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
-{
- if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
- bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
- bi->bi_inum);
- return;
- }
-
- if (bi->bi_flags & BCH_INODE_unlinked) {
- bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
- return;
- }
-
- if (bi->bi_nlink)
- bi->bi_nlink--;
- else
- bi->bi_flags |= BCH_INODE_unlinked;
-}
-
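-/*
- * Per-inode IO options are stored with a +1 bias: 0 means "unset, use the
- * filesystem-wide option", so e.g. bi_data_replicas == 3 means 2 replicas.
- */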
-struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
-{
- struct bch_opts ret = { 0 };
-#define x(_name, _bits) \
- if (inode->bi_##_name) \
- opt_set(ret, _name, inode->bi_##_name - 1);
- BCH_INODE_OPTS()
-#undef x
- return ret;
-}
-
-void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
- struct bch_inode_unpacked *inode)
-{
-#define x(_name, _bits) \
- if ((inode)->bi_##_name) { \
- opts->_name = inode->bi_##_name - 1; \
- opts->_name##_from_inode = true; \
- } else { \
- opts->_name = c->opts._name; \
- opts->_name##_from_inode = false; \
- }
- BCH_INODE_OPTS()
-#undef x
-
- bch2_io_opts_fixups(opts);
-}
-
-int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
-{
- struct bch_inode_unpacked inode;
- int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
-
- if (ret)
- return ret;
-
- bch2_inode_opts_get(opts, trans->c, &inode);
- return 0;
-}
-
-int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *bi, unsigned v)
-{
- struct bch_fs *c = trans->c;
-
-#ifndef CONFIG_UNICODE
- bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE");
- return -EOPNOTSUPP;
-#endif
-
- if (c->opts.casefold_disabled)
- return -EOPNOTSUPP;
-
- int ret = 0;
- /* Not supported on individual files. */
- if (!S_ISDIR(bi->bi_mode))
- return -EOPNOTSUPP;
-
- /*
- * Make sure the dir is empty, as otherwise we'd need to
- * rehash everything and update the dirent keys.
- */
- ret = bch2_empty_dir_trans(trans, inum);
- if (ret < 0)
- return ret;
-
- ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
- if (ret)
- return ret;
-
- bch2_check_set_feature(c, BCH_FEATURE_casefolding);
-
- bi->bi_casefold = v + 1;
- bi->bi_fields_set |= BIT(Inode_opt_casefold);
-
- return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi);
-}
-
-static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter = {};
- struct bkey_i_inode_generation delete;
- struct bch_inode_unpacked inode_u;
- struct bkey_s_c k;
- int ret;
-
- do {
- ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL) ?:
- bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL) ?:
- bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL);
- } while (ret == -BCH_ERR_transaction_restart_nested);
- if (ret)
- goto err;
-retry:
- bch2_trans_begin(trans);
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum, snapshot), BTREE_ITER_intent);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!bkey_is_inode(k.k)) {
- bch2_fs_inconsistent(c,
- "inode %llu:%u not found when deleting",
- inum, snapshot);
- ret = bch_err_throw(c, ENOENT_inode);
- goto err;
- }
-
- bch2_inode_unpack(k, &inode_u);
-
- /* Subvolume root? */
- if (inode_u.bi_subvol)
- bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
-
- bkey_inode_generation_init(&delete.k_i);
- delete.k.p = iter.pos;
- delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
-
- ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- return ret ?: -BCH_ERR_transaction_restart_nested;
-}
-
-/*
- * After deleting an inode, there may be versions in older snapshots that should
- * also be deleted - if they're not referenced by sibling snapshots and not open
- * in other subvolumes:
- */
-static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-next_parent:
- ret = lockrestart_do(trans,
- bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
- if (ret || !k.k)
- return ret;
-
- bool unlinked = bkey_is_unlinked_inode(k);
- pos = k.k->p;
- bch2_trans_iter_exit(trans, &iter);
-
- if (!unlinked)
- return 0;
-
- ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
- if (ret)
- return ret < 0 ? ret : 0;
-
- ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
- if (ret)
- return ret;
- goto next_parent;
-}
-
-int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
-{
- return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
- delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
-}
-
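-/*
- * Check whether an inode can really be deleted: called both from the normal
- * unlink path (@from_deleted_inodes == false) and when walking the
- * deleted_inodes btree (@from_deleted_inodes == true), where a return value
- * of 1 tells the caller to go ahead and delete the inode.
- */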
-static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos,
- bool from_deleted_inodes)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter inode_iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode);
- if (fsck_err_on(from_deleted_inodes && ret,
- trans, deleted_inode_missing,
- "nonexistent inode %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
- if (ret)
- goto out;
-
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- goto out;
-
- if (S_ISDIR(inode.bi_mode)) {
- ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
- if (fsck_err_on(from_deleted_inodes &&
- bch2_err_matches(ret, ENOTEMPTY),
- trans, deleted_inode_is_dir,
- "non empty directory %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
- if (ret)
- goto out;
- }
-
- ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked);
- if (fsck_err_on(from_deleted_inodes && ret,
- trans, deleted_inode_not_unlinked,
- "non-deleted inode %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
- if (ret)
- goto out;
-
- ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot)
- ? 0 : bch_err_throw(c, inode_has_child_snapshot);
-
- if (fsck_err_on(from_deleted_inodes && ret,
- trans, deleted_inode_has_child_snapshots,
- "inode with child snapshots %llu:%u in deleted_inodes btree",
- pos.offset, pos.snapshot))
- goto delete;
- if (ret)
- goto out;
-
- ret = bch2_inode_has_child_snapshots(trans, k.k->p);
- if (ret < 0)
- goto out;
-
- if (ret) {
- if (fsck_err(trans, inode_has_child_snapshots_wrong,
- "inode has_child_snapshots flag wrong (should be set)\n%s",
- (printbuf_reset(&buf),
- bch2_inode_unpacked_to_text(&buf, &inode),
- buf.buf))) {
- inode.bi_flags |= BCH_INODE_has_child_snapshot;
- ret = __bch2_fsck_write_inode(trans, &inode);
- if (ret)
- goto out;
- }
-
- if (!from_deleted_inodes) {
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- bch_err_throw(c, inode_has_child_snapshot);
- goto out;
- }
-
- goto delete;
- }
-
- if (from_deleted_inodes) {
- if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
- !fsck_err(trans, deleted_inode_but_clean,
- "filesystem marked as clean but have deleted inode %llu:%u",
- pos.offset, pos.snapshot)) {
- ret = 0;
- goto out;
- }
-
- ret = 1;
- }
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, &inode_iter);
- printbuf_exit(&buf);
- return ret;
-delete:
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
- goto out;
-}
-
-static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum)
-{
- u32 snapshot;
-
- return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
- may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false);
-}
-
-int bch2_delete_dead_inodes(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- int ret;
-
- /*
- * if we ran check_inodes() unlinked inodes will have already been
- * cleaned up but the write buffer will be out of sync; therefore we
- * always need a write buffer flush
- */
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- /*
- * Weird transaction restart handling here because on successful delete,
- * bch2_inode_rm_snapshot() will return a nested transaction restart,
- * but we can't retry because the btree write buffer won't have been
- * flushed and we'd spin:
- */
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- ret = may_delete_deleted_inode(trans, k.k->p, true);
- if (ret > 0) {
- bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
- k.k->p.offset, k.k->p.snapshot);
-
- ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
- /*
- * We don't want to loop here: a transaction restart
- * error here means we handled a transaction restart and
- * we're actually done, but if we loop we'll retry the
- * same key because the write buffer hasn't been flushed
- * yet
- */
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
- }
-
- ret;
- }));
-err:
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
deleted file mode 100644
index b8ec3e628d90..000000000000
--- a/fs/bcachefs/inode.h
+++ /dev/null
@@ -1,319 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_H
-#define _BCACHEFS_INODE_H
-
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "opts.h"
-#include "snapshot.h"
-
-extern const char * const bch2_inode_opts[];
-
-int bch2_inode_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
-
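-/*
- * If the snapshot is a leaf it cannot have child snapshots, so we only need
- * to scan the inodes btree when bch2_snapshot_is_leaf() says it might:
- */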
-static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
-{
- return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
- ? __bch2_inode_has_child_snapshots(trans, pos)
- : 0;
-}
-
-int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_inode ((struct bkey_ops) { \
- .key_validate = bch2_inode_validate, \
- .val_to_text = bch2_inode_to_text, \
- .trigger = bch2_trigger_inode, \
- .min_val_size = 16, \
-})
-
-#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
- .key_validate = bch2_inode_v2_validate, \
- .val_to_text = bch2_inode_to_text, \
- .trigger = bch2_trigger_inode, \
- .min_val_size = 32, \
-})
-
-#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
- .key_validate = bch2_inode_v3_validate, \
- .val_to_text = bch2_inode_to_text, \
- .trigger = bch2_trigger_inode, \
- .min_val_size = 48, \
-})
-
-static inline bool bkey_is_inode(const struct bkey *k)
-{
- return k->type == KEY_TYPE_inode ||
- k->type == KEY_TYPE_inode_v2 ||
- k->type == KEY_TYPE_inode_v3;
-}
-
-int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
- .key_validate = bch2_inode_generation_validate, \
- .val_to_text = bch2_inode_generation_to_text, \
- .min_val_size = 8, \
-})
-
-int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \
- .key_validate = bch2_inode_alloc_cursor_validate, \
- .val_to_text = bch2_inode_alloc_cursor_to_text, \
- .min_val_size = 16, \
-})
-
-#if 0
-typedef struct {
- u64 lo;
- u32 hi;
-} __packed __aligned(4) u96;
-#endif
-typedef u64 u96;
-
-struct bch_inode_unpacked {
- u64 bi_inum;
- u32 bi_snapshot;
- u64 bi_journal_seq;
- __le64 bi_hash_seed;
- u64 bi_size;
- u64 bi_sectors;
- u64 bi_version;
- u32 bi_flags;
- u16 bi_mode;
-
-#define x(_name, _bits) u##_bits _name;
- BCH_INODE_FIELDS_v3()
-#undef x
-};
-BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24);
-
-struct bkey_inode_buf {
- struct bkey_i_inode_v3 inode;
-
-#define x(_name, _bits) + 8 + _bits / 8
- u8 _pad[0 + BCH_INODE_FIELDS_v3()];
-#undef x
-};
-
-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
-int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
-struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
-
-void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
-
-int __bch2_inode_peek(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, subvol_inum, unsigned, bool);
-
-static inline int bch2_inode_peek_nowarn(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags)
-{
- return __bch2_inode_peek(trans, iter, inode, inum, flags, false);
-}
-
-static inline int bch2_inode_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode,
- subvol_inum inum, unsigned flags)
-{
- return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
-}
-
-int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32,
- struct bch_inode_unpacked *, unsigned);
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
- subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
- struct bch_inode_unpacked *);
-
-int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
- struct bch_inode_unpacked *root);
-
-int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
-
-static inline int bch2_inode_write(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode)
-{
- return bch2_inode_write_flags(trans, iter, inode, 0);
-}
-
-int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
-int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
-
-void bch2_inode_init_early(struct bch_fs *,
- struct bch_inode_unpacked *);
-void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64,
- uid_t, gid_t, umode_t, dev_t,
- struct bch_inode_unpacked *);
-void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
- uid_t, gid_t, umode_t, dev_t,
- struct bch_inode_unpacked *);
-
-int bch2_inode_create(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, u32, u64);
-
-int bch2_inode_rm(struct bch_fs *, subvol_inum);
-
-#define inode_opt_get(_c, _inode, _name) \
- ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
-
-static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
- enum inode_opt_id id, u64 v)
-{
- switch (id) {
-#define x(_name, ...) \
- case Inode_opt_##_name: \
- inode->bi_##_name = v; \
- break;
- BCH_INODE_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
- enum inode_opt_id id)
-{
- switch (id) {
-#define x(_name, ...) \
- case Inode_opt_##_name: \
- return inode->bi_##_name;
- BCH_INODE_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-static inline u8 mode_to_type(umode_t mode)
-{
- return (mode >> 12) & 15;
-}
-
-static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
-{
- return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
-}
-
-static inline u32 bch2_inode_flags(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
- case KEY_TYPE_inode_v2:
- return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
- case KEY_TYPE_inode_v3:
- return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
- default:
- return 0;
- }
-}
-
-static inline unsigned bkey_inode_mode(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode);
- case KEY_TYPE_inode_v2:
- return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode);
- case KEY_TYPE_inode_v3:
- return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v);
- default:
- return 0;
- }
-}
-
-static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi)
-{
- /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */
- return bi->bi_casefold
- ? bi->bi_casefold - 1
- : c->opts.casefold;
-}
-
-static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi)
-{
- return bi->bi_dir || bi->bi_dir_offset;
-}
-
-/* i_nlink: */
-
-static inline unsigned nlink_bias(umode_t mode)
-{
- return S_ISDIR(mode) ? 2 : 1;
-}
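-
-/*
- * bi_nlink stores the link count minus this bias, so e.g. an empty directory
- * (two links: "." and its dirent in the parent) is stored as bi_nlink == 0:
- */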
-
-static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
-{
- return bi->bi_flags & BCH_INODE_unlinked
- ? 0
- : bi->bi_nlink + nlink_bias(bi->bi_mode);
-}
-
-static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
- unsigned nlink)
-{
- if (nlink) {
- bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
- bi->bi_flags &= ~BCH_INODE_unlinked;
- } else {
- bi->bi_nlink = 0;
- bi->bi_flags |= BCH_INODE_unlinked;
- }
-}
-
-int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
-void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
-
-struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
-void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
- struct bch_inode_unpacked *);
-int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *);
-int bch2_inode_set_casefold(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *, unsigned);
-
-#include "rebalance.h"
-
-static inline struct bch_extent_rebalance
-bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
-{
- struct bch_io_opts io_opts;
- bch2_inode_opts_get(&io_opts, c, inode);
- return io_opts_to_rebalance_opts(c, &io_opts);
-}
-
-#define BCACHEFS_ROOT_SUBVOL_INUM \
- ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
-
-static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b)
-{
- return a.subvol == b.subvol && a.inum == b.inum;
-}
-
-int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
-int bch2_delete_dead_inodes(struct bch_fs *);
-
-#endif /* _BCACHEFS_INODE_H */
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
deleted file mode 100644
index 1f00938b1bdc..000000000000
--- a/fs/bcachefs/inode_format.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_INODE_FORMAT_H
-#define _BCACHEFS_INODE_FORMAT_H
-
-#define BLOCKDEV_INODE_MAX 4096
-#define BCACHEFS_ROOT_INO 4096
-
-struct bch_inode {
- struct bch_val v;
-
- __le64 bi_hash_seed;
- __le32 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le64 bi_sectors;
- __le64 bi_size;
- __le64 bi_version;
- __u8 fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL 6
-#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
- struct bch_val v;
-
- __le32 bi_generation;
- __le32 pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_size, 64) \
- x(bi_sectors, 64) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32)
-
-#define BCH_INODE_FIELDS_v3() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32) \
- x(bi_nocow, 8) \
- x(bi_depth, 32) \
- x(bi_inodes_32bit, 8) \
- x(bi_casefold, 8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS() \
- x(data_checksum, 8) \
- x(compression, 8) \
- x(project, 32) \
- x(background_compression, 8) \
- x(data_replicas, 8) \
- x(promote_target, 16) \
- x(foreground_target, 16) \
- x(background_target, 16) \
- x(erasure_code, 16) \
- x(nocow, 8) \
- x(inodes_32bit, 8) \
- x(casefold, 8)
-
-enum inode_opt_id {
-#define x(name, ...) \
- Inode_opt_##name,
- BCH_INODE_OPTS()
-#undef x
- Inode_opt_nr,
-};
-
-/*
- * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive -
- * for overlayfs
- */
-#define BCH_INODE_FLAGS() \
- x(sync, 0) \
- x(immutable, 1) \
- x(append, 2) \
- x(nodump, 3) \
- x(noatime, 4) \
- x(i_size_dirty, 5) \
- x(i_sectors_dirty, 6) \
- x(unlinked, 7) \
- x(backptr_untrusted, 8) \
- x(has_child_snapshot, 9) \
- x(has_case_insensitive, 10)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n) BCH_INODE_##t = 1U << n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n) __BCH_INODE_##t = n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
- struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
-
-struct bch_inode_alloc_cursor {
- struct bch_val v;
- __u8 bits;
- __u8 pad;
- __le32 gen;
- __le64 idx;
-};
-
-#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
deleted file mode 100644
index 07023667a475..000000000000
--- a/fs/bcachefs/io_misc.c
+++ /dev/null
@@ -1,570 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * io_misc.c - fallocate, fpunch, truncate:
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "extents.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "logged_ops.h"
-#include "rebalance.h"
-#include "subvolume.h"
-
-/* Overwrites whatever was present with zeroes: */
-int bch2_extent_fallocate(struct btree_trans *trans,
- subvol_inum inum,
- struct btree_iter *iter,
- u64 sectors,
- struct bch_io_opts opts,
- s64 *i_sectors_delta,
- struct write_point_specifier write_point)
-{
- struct bch_fs *c = trans->c;
- struct disk_reservation disk_res = { 0 };
- struct closure cl;
- struct open_buckets open_buckets = { 0 };
- struct bkey_s_c k;
- struct bkey_buf old, new;
- unsigned sectors_allocated = 0, new_replicas;
- bool unwritten = opts.nocow &&
- c->sb.version >= bcachefs_metadata_version_unwritten_extents;
- int ret;
-
- bch2_bkey_buf_init(&old);
- bch2_bkey_buf_init(&new);
- closure_init_stack(&cl);
-
- k = bch2_btree_iter_peek_slot(trans, iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
- new_replicas = max(0, (int) opts.data_replicas -
- (int) bch2_bkey_nr_ptrs_fully_allocated(k));
-
- /*
- * Get a disk reservation before (in the nocow case) calling
- * into the allocator:
- */
- ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
- if (unlikely(ret))
- goto err_noprint;
-
- bch2_bkey_buf_reassemble(&old, c, k);
-
- if (!unwritten) {
- struct bkey_i_reservation *reservation;
-
- bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
- reservation = bkey_reservation_init(new.k);
- reservation->k.p = iter->pos;
- bch2_key_resize(&reservation->k, sectors);
- reservation->v.nr_replicas = opts.data_replicas;
- } else {
- struct bkey_i_extent *e;
- struct bch_devs_list devs_have;
- struct write_point *wp;
-
- devs_have.nr = 0;
-
- bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
-
- e = bkey_extent_init(new.k);
- e->k.p = iter->pos;
-
- ret = bch2_alloc_sectors_start_trans(trans,
- opts.foreground_target,
- false,
- write_point,
- &devs_have,
- opts.data_replicas,
- opts.data_replicas,
- BCH_WATERMARK_normal, 0, &cl, &wp);
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- ret = bch_err_throw(c, transaction_restart_nested);
- if (ret)
- goto err;
-
- sectors = min_t(u64, sectors, wp->sectors_free);
- sectors_allocated = sectors;
-
- bch2_key_resize(&e->k, sectors);
-
- bch2_open_bucket_get(c, wp, &open_buckets);
- bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
- bch2_alloc_sectors_done(c, wp);
-
- extent_for_each_ptr(extent_i_to_s(e), ptr)
- ptr->unwritten = true;
- }
-
- ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
- 0, i_sectors_delta, true);
-err:
- if (!ret && sectors_allocated)
- bch2_increment_clock(c, sectors_allocated, WRITE);
- if (should_print_err(ret)) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9));
- prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-err_noprint:
- bch2_open_buckets_put(c, &open_buckets);
- bch2_disk_reservation_put(c, &disk_res);
- bch2_bkey_buf_exit(&new, c);
- bch2_bkey_buf_exit(&old, c);
-
- if (closure_nr_remaining(&cl) != 1) {
- bch2_trans_unlock_long(trans);
- bch2_wait_on_allocator(c, &cl);
- }
-
- return ret;
-}
-
-/* For fsck */
-int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end)
-{
- u32 restart_count = trans->restart_count;
- struct bch_fs *c = trans->c;
- struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0);
- unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- struct bkey_i delete;
-
- int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
- start, end, 0, k,
- &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- bkey_init(&delete.k);
- delete.k.p = iter.pos;
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end, &delete);
-
- bch2_extent_trim_atomic(trans, &iter, &delete) ?:
- bch2_trans_update(trans, &iter, &delete, 0);
- }));
-
- bch2_disk_reservation_put(c, &disk_res);
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-/*
- * Returns -BCH_ERR_transaction_restart if we had to drop locks:
- */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
- subvol_inum inum, u64 end,
- s64 *i_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- struct bpos end_pos = POS(inum.inum, end);
- struct bkey_s_c k;
- int ret = 0, ret2 = 0;
- u32 snapshot;
-
- while (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i delete;
-
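- /*
- * remember that we saw a transaction restart, so the caller still
- * sees -BCH_ERR_transaction_restart even if the final iteration
- * succeeds:
- */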
- if (ret)
- ret2 = ret;
-
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(trans, iter, snapshot);
-
- /*
- * peek_max() doesn't have ideal semantics for extents:
- */
- k = bch2_btree_iter_peek_max(trans, iter, end_pos);
- if (!k.k)
- break;
-
- ret = bkey_err(k);
- if (ret)
- continue;
-
- bkey_init(&delete.k);
- delete.k.p = iter->pos;
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end_pos, &delete);
-
- ret = bch2_extent_update(trans, inum, iter, &delete,
- &disk_res, 0, i_sectors_delta, false);
- bch2_disk_reservation_put(c, &disk_res);
- }
-
- return ret ?: ret2;
-}
-
-int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
- s64 *i_sectors_delta)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, start),
- BTREE_ITER_intent);
-
- ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
-
- return ret;
-}
-
-/* truncate: */
-
-void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
-
- prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
- prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
- prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
-}
-
-static int truncate_set_isize(struct btree_trans *trans,
- subvol_inum inum,
- u64 new_i_size,
- bool warn)
-{
- struct btree_iter iter = {};
- struct bch_inode_unpacked inode_u;
- int ret;
-
- ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?:
- (inode_u.bi_size = new_i_size, 0) ?:
- bch2_inode_write(trans, &iter, &inode_u);
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
- struct bkey_i *op_k,
- u64 *i_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter fpunch_iter;
- struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
- subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
- u64 new_i_size = le64_to_cpu(op->v.new_i_size);
- bool warn_errors = i_sectors_delta != NULL;
- int ret;
-
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL));
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
- POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
- BTREE_ITER_intent);
- ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
- bch2_trans_iter_exit(trans, &fpunch_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
-err:
- if (warn_errors)
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
-{
- return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
-}
-
-int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
-{
- struct bkey_i_logged_op_truncate op;
-
- bkey_logged_op_truncate_init(&op.k_i);
- op.v.subvol = cpu_to_le32(inum.subvol);
- op.v.inum = cpu_to_le64(inum.inum);
- op.v.new_i_size = cpu_to_le64(new_i_size);
-
- /*
- * Logged ops aren't atomic w.r.t. snapshot creation: creating a
- * snapshot while they're in progress, then crashing, will result in the
- * resume only proceeding in one of the snapshots
- */
- down_read(&c->snapshot_create_lock);
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = bch2_logged_op_start(trans, &op.k_i);
- if (ret)
- goto out;
- ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta);
- ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
-out:
- bch2_trans_put(trans);
- up_read(&c->snapshot_create_lock);
-
- return ret;
-}
-
-/* finsert/fcollapse: */
-
-void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
-
- prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
- prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
- prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset));
- prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset));
-}
-
-static int adjust_i_size(struct btree_trans *trans, subvol_inum inum,
- u64 offset, s64 len, bool warn)
-{
- struct btree_iter iter;
- struct bch_inode_unpacked inode_u;
- int ret;
-
- offset <<= 9;
- len <<= 9;
-
- ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn);
- if (ret)
- return ret;
-
- if (len > 0) {
- if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
- ret = -EFBIG;
- goto err;
- }
-
- if (offset >= inode_u.bi_size) {
- ret = -EINVAL;
- goto err;
- }
- }
-
- inode_u.bi_size += len;
- inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
-
- ret = bch2_inode_write(trans, &iter, &inode_u);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
- struct bkey_i *op_k,
- u64 *i_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
- subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
- struct bch_io_opts opts;
- u64 dst_offset = le64_to_cpu(op->v.dst_offset);
- u64 src_offset = le64_to_cpu(op->v.src_offset);
- s64 shift = dst_offset - src_offset;
- u64 len = abs(shift);
- u64 pos = le64_to_cpu(op->v.pos);
- bool insert = shift > 0;
- u32 snapshot;
- bool warn_errors = i_sectors_delta != NULL;
- int ret = 0;
-
- ret = bch2_inum_opts_get(trans, inum, &opts);
- if (ret)
- return ret;
-
- /*
- * check for a missing subvolume before fpunch: when resuming a logged
- * op we don't want a missing subvolume to be a fatal error
- */
- ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors));
- if (ret)
- return ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, 0),
- BTREE_ITER_intent);
-
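- /*
- * Resumable state machine: each state transition is persisted with
- * bch2_logged_op_update(), so after a crash we pick up from whichever
- * state (and extent position) was last committed:
- */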
- switch (op->v.state) {
-case LOGGED_OP_FINSERT_start:
- op->v.state = LOGGED_OP_FINSERT_shift_extents;
-
- if (insert) {
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, src_offset, len, warn_errors) ?:
- bch2_logged_op_update(trans, &op->k_i));
- if (ret)
- goto err;
- } else {
- bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset));
-
- ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto err;
-
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_logged_op_update(trans, &op->k_i));
- }
-
- fallthrough;
-case LOGGED_OP_FINSERT_shift_extents:
- while (1) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i delete, *copy;
- struct bkey_s_c k;
- struct bpos src_pos = POS(inum.inum, src_offset);
-
- bch2_trans_begin(trans);
-
- ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot,
- warn_errors);
- if (ret)
- goto btree_err;
-
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot));
-
- k = insert
- ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0))
- : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX));
- if ((ret = bkey_err(k)))
- goto btree_err;
-
- if (!k.k ||
- k.k->p.inode != inum.inum ||
- bkey_le(k.k->p, POS(inum.inum, src_offset)))
- break;
-
- copy = bch2_bkey_make_mut_noupdate(trans, k);
- if ((ret = PTR_ERR_OR_ZERO(copy)))
- goto btree_err;
-
- if (insert &&
- bkey_lt(bkey_start_pos(k.k), src_pos)) {
- bch2_cut_front(src_pos, copy);
-
- /* Splitting compressed extent? */
- bch2_disk_reservation_add(c, &disk_res,
- copy->k.size *
- bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
- BCH_DISK_RESERVATION_NOFAIL);
- }
-
- bkey_init(&delete.k);
- delete.k.p = copy->k.p;
- delete.k.p.snapshot = snapshot;
- delete.k.size = copy->k.size;
-
- copy->k.p.offset += shift;
- copy->k.p.snapshot = snapshot;
-
- op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
-
- ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
- bch2_logged_op_update(trans, &op->k_i) ?:
- bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
-btree_err:
- bch2_disk_reservation_put(c, &disk_res);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
-
- pos = le64_to_cpu(op->v.pos);
- }
-
- op->v.state = LOGGED_OP_FINSERT_finish;
-
- if (!insert) {
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?:
- bch2_logged_op_update(trans, &op->k_i));
- } else {
- /* We need an inode update to update bi_journal_seq for fsync: */
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- adjust_i_size(trans, inum, 0, 0, warn_errors) ?:
- bch2_logged_op_update(trans, &op->k_i));
- }
-
- break;
-case LOGGED_OP_FINSERT_finish:
- break;
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- if (warn_errors)
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
-{
- return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
-}
-
-int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
- u64 offset, u64 len, bool insert,
- s64 *i_sectors_delta)
-{
- struct bkey_i_logged_op_finsert op;
- s64 shift = insert ? len : -len;
-
- bkey_logged_op_finsert_init(&op.k_i);
- op.v.subvol = cpu_to_le32(inum.subvol);
- op.v.inum = cpu_to_le64(inum.inum);
- op.v.dst_offset = cpu_to_le64(offset + shift);
- op.v.src_offset = cpu_to_le64(offset);
- op.v.pos = cpu_to_le64(insert ? U64_MAX : offset);
-
- /*
- * Logged ops aren't atomic w.r.t. snapshot creation: creating a
- * snapshot while they're in progress, then crashing, will result in the
- * resume only proceeding in one of the snapshots
- */
- down_read(&c->snapshot_create_lock);
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = bch2_logged_op_start(trans, &op.k_i);
- if (ret)
- goto out;
- ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta);
- ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
-out:
- bch2_trans_put(trans);
- up_read(&c->snapshot_create_lock);
-
- return ret;
-}
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
deleted file mode 100644
index b93e4d4b3c0c..000000000000
--- a/fs/bcachefs/io_misc.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_MISC_H
-#define _BCACHEFS_IO_MISC_H
-
-int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
- u64, struct bch_io_opts, s64 *,
- struct write_point_specifier);
-
-int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos);
-int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
- subvol_inum, u64, s64 *);
-int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-
-void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \
- .val_to_text = bch2_logged_op_truncate_to_text, \
- .min_val_size = 24, \
-})
-
-int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
-
-int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
-
-void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \
- .val_to_text = bch2_logged_op_finsert_to_text, \
- .min_val_size = 24, \
-})
-
-int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
-
-int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
-
-#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
deleted file mode 100644
index e0874ad9a6cf..000000000000
--- a/fs/bcachefs/io_read.c
+++ /dev/null
@@ -1,1543 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Some low level IO code, and hacks for various block layer limitations
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "data_update.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "io_read.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "reflink.h"
-#include "subvolume.h"
-#include "trace.h"
-
-#include <linux/moduleparam.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_read_corrupt_ratio;
-module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
-MODULE_PARM_DESC(read_corrupt_ratio, "");
-#endif
-
-static bool bch2_poison_extents_on_checksum_error;
-module_param_named(poison_extents_on_checksum_error,
- bch2_poison_extents_on_checksum_error, bool, 0644);
-MODULE_PARM_DESC(poison_extents_on_checksum_error,
- "Extents with checksum errors are marked as poisoned - unsafe without read fua support");
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
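-/*
- * Per-device congestion counts decay over time; the total for the target is
- * compared against a random threshold so reads are probabilistically steered
- * away from congested devices:
- */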
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
- const struct bch_devs_mask *devs;
- unsigned d, nr = 0, total = 0;
- u64 now = local_clock(), last;
- s64 congested;
- struct bch_dev *ca;
-
- if (!target)
- return false;
-
- guard(rcu)();
- devs = bch2_target_to_mask(c, target) ?:
- &c->rw_devs[BCH_DATA_user];
-
- for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
- ca = rcu_dereference(c->devs[d]);
- if (!ca)
- continue;
-
- congested = atomic_read(&ca->congested);
- last = READ_ONCE(ca->congested_last);
- if (time_after64(now, last))
- congested -= (now - last) >> 12;
-
- total += max(congested, 0LL);
- nr++;
- }
-
- return get_random_u32_below(nr * CONGESTED_MAX) < total;
-}
-
-#else
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
- return false;
-}
-
-#endif
-
-/* Cache promotion on read */
-
-static const struct rhashtable_params bch_promote_params = {
- .head_offset = offsetof(struct promote_op, hash),
- .key_offset = offsetof(struct promote_op, pos),
- .key_len = sizeof(struct bpos),
- .automatic_shrinking = true,
-};
-
-static inline bool have_io_error(struct bch_io_failures *failed)
-{
- return failed && failed->nr;
-}
-
-static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
-{
- EBUG_ON(rbio->split);
-
- return rbio->data_update
- ? container_of(rbio, struct data_update, rbio)
- : NULL;
-}
-
-static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
-{
- struct data_update *u = rbio_data_update(orig);
- if (!u)
- return false;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
- unsigned i = 0;
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr->dev == dev &&
- u->data_opts.rewrite_ptrs & BIT(i))
- return true;
- i++;
- }
-
- return false;
-}
-
-static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
- struct bpos pos,
- struct bch_io_opts opts,
- unsigned flags,
- struct bch_io_failures *failed)
-{
- if (!have_io_error(failed)) {
- BUG_ON(!opts.promote_target);
-
- if (!(flags & BCH_READ_may_promote))
- return bch_err_throw(c, nopromote_may_not);
-
- if (bch2_bkey_has_target(c, k, opts.promote_target))
- return bch_err_throw(c, nopromote_already_promoted);
-
- if (bkey_extent_is_unwritten(k))
- return bch_err_throw(c, nopromote_unwritten);
-
- if (bch2_target_congested(c, opts.promote_target))
- return bch_err_throw(c, nopromote_congested);
- }
-
- if (rhashtable_lookup_fast(&c->promote_table, &pos,
- bch_promote_params))
- return bch_err_throw(c, nopromote_in_flight);
-
- return 0;
-}
-
-static noinline void promote_free(struct bch_read_bio *rbio)
-{
- struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
- struct bch_fs *c = rbio->c;
-
- int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
-
- async_object_list_del(c, promote, op->list_idx);
- async_object_list_del(c, rbio, rbio->list_idx);
-
- bch2_data_update_exit(&op->write);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
- kfree_rcu(op, rcu);
-}
-
-static void promote_done(struct bch_write_op *wop)
-{
- struct promote_op *op = container_of(wop, struct promote_op, write.op);
- struct bch_fs *c = op->write.rbio.c;
-
- bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
- promote_free(&op->write.rbio);
-}
-
-static void promote_start_work(struct work_struct *work)
-{
- struct promote_op *op = container_of(work, struct promote_op, work);
-
- bch2_data_update_read_done(&op->write);
-}
-
-static noinline void promote_start(struct bch_read_bio *rbio)
-{
- struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
-
- trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
-
- INIT_WORK(&op->work, promote_start_work);
- queue_work(rbio->c->write_ref_wq, &op->work);
-}
-
-static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bkey_s_c k,
- struct bpos pos,
- struct extent_ptr_decoded *pick,
- unsigned sectors,
- struct bch_read_bio *orig,
- struct bch_io_failures *failed)
-{
- struct bch_fs *c = trans->c;
- int ret;
-
- struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
-
- if (!have_io_error(failed)) {
- update_opts.target = orig->opts.promote_target;
- update_opts.extra_replicas = 1;
- update_opts.write_flags |= BCH_WRITE_cached;
- update_opts.write_flags |= BCH_WRITE_only_specified_devs;
- } else {
- update_opts.target = orig->opts.foreground_target;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned ptr_bit = 1;
- bkey_for_each_ptr(ptrs, ptr) {
- if (bch2_dev_io_failures(failed, ptr->dev) &&
- !ptr_being_rewritten(orig, ptr->dev))
- update_opts.rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
-
- if (!update_opts.rewrite_ptrs)
- return NULL;
- }
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
- return ERR_PTR(-BCH_ERR_nopromote_no_writes);
-
- struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
- if (!op) {
- ret = bch_err_throw(c, nopromote_enomem);
- goto err_put;
- }
-
- op->start_time = local_clock();
- op->pos = pos;
-
- if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
- bch_promote_params)) {
- ret = bch_err_throw(c, nopromote_in_flight);
- goto err;
- }
-
- ret = async_object_list_add(c, promote, op, &op->list_idx);
- if (ret < 0)
- goto err_remove_hash;
-
- ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
- writepoint_hashed((unsigned long) current),
- &orig->opts,
- update_opts,
- btree_id, k);
- op->write.type = BCH_DATA_UPDATE_promote;
- /*
- * possible errors: -BCH_ERR_nocow_lock_blocked,
- * -BCH_ERR_ENOSPC_disk_reservation:
- */
- if (ret)
- goto err_remove_list;
-
- rbio_init_fragment(&op->write.rbio.bio, orig);
- op->write.rbio.bounce = true;
- op->write.rbio.promote = true;
- op->write.op.end_io = promote_done;
-
- return &op->write.rbio;
-err_remove_list:
- async_object_list_del(c, promote, op->list_idx);
-err_remove_hash:
- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params));
-err:
- bio_free_pages(&op->write.op.wbio.bio);
- /* We may have added to the rhashtable and thus need rcu freeing: */
- kfree_rcu(op, rcu);
-err_put:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
- return ERR_PTR(ret);
-}
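__promote_alloc() unwinds its partially-acquired state with the usual staged goto pattern: each error label releases only what was taken before the failure, in reverse order of acquisition. A small standalone sketch of the same pattern (hypothetical names; plain malloc/free stand in for the rhashtable entry, async-list slot and write ref):

#include <stdio.h>
#include <stdlib.h>

struct op { int *a, *b, *c; };

static int op_init(struct op *op)
{
	int ret = 0;

	op->a = malloc(16);		/* first resource */
	if (!op->a)
		return -1;

	op->b = malloc(16);		/* second resource */
	if (!op->b) {
		ret = -1;
		goto err_free_a;
	}

	op->c = malloc(16);		/* third resource */
	if (!op->c) {
		ret = -1;
		goto err_free_b;
	}

	return 0;
err_free_b:
	free(op->b);
err_free_a:
	free(op->a);
	return ret;
}

int main(void)
{
	struct op op;

	if (!op_init(&op)) {
		free(op.c);
		free(op.b);
		free(op.a);
		puts("acquired and released in reverse order");
	}
	return 0;
}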
-
-noinline
-static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
- struct bvec_iter iter,
- struct bkey_s_c k,
- struct extent_ptr_decoded *pick,
- unsigned flags,
- struct bch_read_bio *orig,
- bool *bounce,
- bool *read_full,
- struct bch_io_failures *failed)
-{
- /*
- * We're in the retry path, but we don't know what to repair yet, and we
- * don't want to do a promote here:
- */
- if (failed && !failed->nr)
- return NULL;
-
- struct bch_fs *c = trans->c;
- /*
- * if failed != NULL we're not actually doing a promote, we're
- * recovering from an io/checksum error
- */
- bool promote_full = (have_io_error(failed) ||
- *read_full ||
- READ_ONCE(c->opts.promote_whole_extents));
- /* data might have to be decompressed in the write path: */
- unsigned sectors = promote_full
- ? max(pick->crc.compressed_size, pick->crc.live_size)
- : bvec_iter_sectors(iter);
- struct bpos pos = promote_full
- ? bkey_start_pos(k.k)
- : POS(k.k->p.inode, iter.bi_sector);
- int ret;
-
- ret = should_promote(c, k, pos, orig->opts, flags, failed);
- if (ret)
- goto nopromote;
-
- struct bch_read_bio *promote =
- __promote_alloc(trans,
- k.k->type == KEY_TYPE_reflink_v
- ? BTREE_ID_reflink
- : BTREE_ID_extents,
- k, pos, pick, sectors, orig, failed);
- if (!promote)
- return NULL;
-
- ret = PTR_ERR_OR_ZERO(promote);
- if (ret)
- goto nopromote;
-
- *bounce = true;
- *read_full = promote_full;
-
- if (have_io_error(failed))
- orig->self_healing = true;
-
- return promote;
-nopromote:
- trace_io_read_nopromote(c, ret);
- return NULL;
-}
-
-void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op)
-{
- if (!op->write.read_done) {
- prt_printf(out, "parent read: %px\n", op->write.rbio.parent);
- printbuf_indent_add(out, 2);
- bch2_read_bio_to_text(out, op->write.rbio.parent);
- printbuf_indent_sub(out, 2);
- }
-
- bch2_data_update_to_text(out, &op->write);
-}
-
-/* Read */
-
-static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- struct bch_read_bio *rbio, struct bpos read_pos)
-{
- int ret = lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, out,
- (subvol_inum) { rbio->subvol, read_pos.inode },
- read_pos.offset << 9));
- if (ret)
- return ret;
-
- if (rbio->data_update)
- prt_str(out, "(internal move) ");
-
- return 0;
-}
-
-static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
- struct bch_read_bio *rbio, struct bpos read_pos)
-{
- bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
-}
-
-enum rbio_context {
- RBIO_CONTEXT_NULL,
- RBIO_CONTEXT_HIGHPRI,
- RBIO_CONTEXT_UNBOUND,
-};
-
-static inline struct bch_read_bio *
-bch2_rbio_parent(struct bch_read_bio *rbio)
-{
- return rbio->split ? rbio->parent : rbio;
-}
-
-__always_inline
-static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
- enum rbio_context context,
- struct workqueue_struct *wq)
-{
- if (context <= rbio->context) {
- fn(&rbio->work);
- } else {
- rbio->work.func = fn;
- rbio->context = context;
- queue_work(wq, &rbio->work);
- }
-}
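bch2_rbio_punt() only defers work when the caller asks for a more permissive execution context than the one the rbio already has; otherwise the function runs inline. A standalone sketch of that rule (assumed names; a single-slot "queue" stands in for the workqueue):

#include <stdio.h>

enum ctx { CTX_NULL, CTX_HIGHPRI, CTX_UNBOUND };

struct item {
	enum ctx	context;
	void		(*fn)(struct item *);
};

static void run_or_queue(struct item *it, void (*fn)(struct item *),
			 enum ctx wanted, struct item **queue)
{
	if (wanted <= it->context) {
		fn(it);			/* already in a good enough context */
	} else {
		it->fn = fn;		/* record and defer */
		it->context = wanted;
		*queue = it;
	}
}

static void work(struct item *it)
{
	printf("ran in context %d\n", (int) it->context);
}

int main(void)
{
	struct item a = { .context = CTX_UNBOUND }, b = { .context = CTX_NULL };
	struct item *queued = NULL;

	run_or_queue(&a, work, CTX_HIGHPRI, &queued);	/* runs inline */
	run_or_queue(&b, work, CTX_UNBOUND, &queued);	/* gets deferred */
	if (queued)
		queued->fn(queued);			/* "workqueue" runs it later */
	return 0;
}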
-
-static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
-{
- BUG_ON(rbio->bounce && !rbio->split);
-
- if (rbio->have_ioref) {
- struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
- }
-
- if (rbio->split) {
- struct bch_read_bio *parent = rbio->parent;
-
- if (unlikely(rbio->promote)) {
- if (!rbio->bio.bi_status)
- promote_start(rbio);
- else
- promote_free(rbio);
- } else {
- async_object_list_del(rbio->c, rbio, rbio->list_idx);
-
- if (rbio->bounce)
- bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
-
- bio_put(&rbio->bio);
- }
-
- rbio = parent;
- }
-
- return rbio;
-}
-
-/*
- * Only called on a top level bch_read_bio to complete an entire read request,
- * not a split:
- */
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
- if (rbio->start_time)
- bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
- rbio->start_time);
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- if (rbio->list_idx)
- async_object_list_del(rbio->c, rbio, rbio->list_idx);
-#endif
- bio_endio(&rbio->bio);
-}
-
-static void get_rbio_extent(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- struct bkey_buf *sk)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = lockrestart_do(trans,
- bkey_err(k = bch2_bkey_get_iter(trans, &iter,
- rbio->data_btree, rbio->data_pos, 0)));
- if (ret)
- return;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr)
- if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
- bch2_bkey_buf_reassemble(sk, trans->c, k);
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
-}
-
-static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
- enum btree_id btree, struct bkey_s_c read_k)
-{
- if (!bch2_poison_extents_on_checksum_error)
- return 0;
-
- struct bch_fs *c = trans->c;
-
- struct data_update *u = rbio_data_update(rbio);
- if (u)
- read_k = bkey_i_to_s_c(u->k.k);
-
- u64 flags = bch2_bkey_extent_flags(read_k);
- if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_and_val_eq(k, read_k))
- goto out;
-
- struct bkey_i *new = bch2_trans_kmalloc(trans,
- bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
- ret = PTR_ERR_OR_ZERO(new) ?:
- (bkey_reassemble(new, k), 0) ?:
- bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
- bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-
- /*
- * Propagate key change back to data update path, in particular so it
- * knows the extent has been poisoned and it's safe to change the
- * checksum
- */
- if (u && !ret)
- bch2_bkey_buf_copy(&u->k, c, new);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter,
- struct bch_io_failures *failed,
- unsigned flags)
-{
- struct data_update *u = container_of(rbio, struct data_update, rbio);
-retry:
- bch2_trans_begin(trans);
-
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = lockrestart_do(trans,
- bkey_err(k = bch2_bkey_get_iter(trans, &iter,
- u->btree_id, bkey_start_pos(&u->k.k->k),
- 0)));
- if (ret)
- goto err;
-
- if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
- /* extent we wanted to read no longer exists: */
- rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten);
- goto err;
- }
-
- ret = __bch2_read_extent(trans, rbio, bvec_iter,
- bkey_start_pos(&u->k.k->k),
- u->btree_id,
- bkey_i_to_s_c(u->k.k),
- 0, failed, flags, -1);
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- bch2_err_matches(ret, BCH_ERR_data_read_retry))
- goto retry;
-
- if (ret) {
- rbio->bio.bi_status = BLK_STS_IOERR;
- rbio->ret = ret;
- }
-
- BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
- return ret;
-}
-
-static void bch2_rbio_retry(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bvec_iter iter = rbio->bvec_iter;
- unsigned flags = rbio->flags;
- subvol_inum inum = {
- .subvol = rbio->subvol,
- .inum = rbio->read_pos.inode,
- };
- struct bch_io_failures failed = { .nr = 0 };
-
- struct btree_trans *trans = bch2_trans_get(c);
-
- struct bkey_buf sk;
- bch2_bkey_buf_init(&sk);
- bkey_init(&sk.k->k);
-
- trace_io_read_retry(&rbio->bio);
- this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
- bvec_iter_sectors(rbio->bvec_iter));
-
- get_rbio_extent(trans, rbio, &sk);
-
- if (!bkey_deleted(&sk.k->k) &&
- bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
- bch2_mark_io_failure(&failed, &rbio->pick,
- rbio->ret == -BCH_ERR_data_read_retry_csum_err);
-
- if (!rbio->split) {
- rbio->bio.bi_status = 0;
- rbio->ret = 0;
- }
-
- unsigned subvol = rbio->subvol;
- struct bpos read_pos = rbio->read_pos;
-
- rbio = bch2_rbio_free(rbio);
-
- flags |= BCH_READ_in_retry;
- flags &= ~BCH_READ_may_promote;
- flags &= ~BCH_READ_last_fragment;
- flags |= BCH_READ_must_clone;
-
- int ret = rbio->data_update
- ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
- : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);
-
- if (ret) {
- rbio->ret = ret;
- rbio->bio.bi_status = BLK_STS_IOERR;
- }
-
- if (failed.nr || ret) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf,
- (subvol_inum) { subvol, read_pos.inode },
- read_pos.offset << 9));
- if (rbio->data_update)
- prt_str(&buf, "(internal move) ");
-
- prt_str(&buf, "data read error, ");
- if (!ret) {
- prt_str(&buf, "successful retry");
- if (rbio->self_healing)
- prt_str(&buf, ", self healing");
- } else
- prt_str(&buf, bch2_err_str(ret));
- prt_newline(&buf);
-
- if (!bkey_deleted(&sk.k->k)) {
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
- prt_newline(&buf);
- }
-
- bch2_io_failures_to_text(&buf, c, &failed);
-
- bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-
- bch2_rbio_done(rbio);
- bch2_bkey_buf_exit(&sk, c);
- bch2_trans_put(trans);
-}
-
-static void bch2_rbio_error(struct bch_read_bio *rbio,
- int ret, blk_status_t blk_error)
-{
- BUG_ON(ret >= 0);
-
- rbio->ret = ret;
- rbio->bio.bi_status = blk_error;
-
- bch2_rbio_parent(rbio)->saw_error = true;
-
- if (rbio->flags & BCH_READ_in_retry)
- return;
-
- if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
- bch2_rbio_punt(rbio, bch2_rbio_retry,
- RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- } else {
- rbio = bch2_rbio_free(rbio);
-
- rbio->ret = ret;
- rbio->bio.bi_status = blk_error;
-
- bch2_rbio_done(rbio);
- }
-}
-
-static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
- struct bch_read_bio *rbio)
-{
- struct bch_fs *c = rbio->c;
- u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
- struct bch_extent_crc_unpacked new_crc;
- struct btree_iter iter;
- struct bkey_i *new;
- struct bkey_s_c k;
- int ret = 0;
-
- if (crc_is_compressed(rbio->pick.crc))
- return 0;
-
- k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_slots|BTREE_ITER_intent);
- if ((ret = bkey_err(k)))
- goto out;
-
- if (bversion_cmp(k.k->bversion, rbio->version) ||
- !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
- goto out;
-
- /* Extent was merged? */
- if (bkey_start_offset(k.k) < data_offset ||
- k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
- goto out;
-
- if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
- rbio->pick.crc, NULL, &new_crc,
- bkey_start_offset(k.k) - data_offset, k.k->size,
- rbio->pick.crc.csum_type)) {
- bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
- ret = 0;
- goto out;
- }
-
- /*
- * going to be temporarily appending another checksum entry:
- */
- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
- sizeof(struct bch_extent_crc128));
- if ((ret = PTR_ERR_OR_ZERO(new)))
- goto out;
-
- bkey_reassemble(new, k);
-
- if (!bch2_bkey_narrow_crcs(new, new_crc))
- goto out;
-
- ret = bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_internal_snapshot_node);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
-{
- bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_rbio_narrow_crcs(trans, rbio));
-}
-
-static void bch2_read_decompress_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_str(&buf, "decompression error");
-
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca)
- bch_err_ratelimited(ca, "%s", buf.buf);
- else
- bch_err_ratelimited(c, "%s", buf.buf);
-
- bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
- printbuf_exit(&buf);
-}
-
-static void bch2_read_decrypt_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_str(&buf, "decrypt error");
-
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca)
- bch_err_ratelimited(ca, "%s", buf.buf);
- else
- bch_err_ratelimited(c, "%s", buf.buf);
-
- bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
- printbuf_exit(&buf);
-}
-
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- struct bch_read_bio *parent = bch2_rbio_parent(rbio);
- struct bio *src = &rbio->bio;
- struct bio *dst = &parent->bio;
- struct bvec_iter dst_iter = rbio->bvec_iter;
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
- struct nonce nonce = extent_nonce(rbio->version, crc);
- unsigned nofs_flags;
- struct bch_csum csum;
- int ret;
-
- nofs_flags = memalloc_nofs_save();
-
- /* Reset iterator for checksumming and copying bounced data: */
- if (rbio->bounce) {
- src->bi_iter.bi_size = crc.compressed_size << 9;
- src->bi_iter.bi_idx = 0;
- src->bi_iter.bi_bvec_done = 0;
- } else {
- src->bi_iter = rbio->bvec_iter;
- }
-
- bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
-
- csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
-
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
- rbio->flags |= BCH_READ_must_bounce;
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
- BLK_STS_IOERR);
- goto out;
- }
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
-
- if (!csum_good)
- goto csum_err;
-
- /*
- * XXX
- * We need to rework the narrow_crcs path to deliver the read completion
- * first, and then punt to a different workqueue, otherwise we're
- * holding up reads while doing btree updates which is bad for memory
- * reclaim.
- */
- if (unlikely(rbio->narrow_crcs))
- bch2_rbio_narrow_crcs(rbio);
-
- if (likely(!parent->data_update)) {
- /* Adjust crc to point to subset of data we want: */
- crc.offset += rbio->offset_into_extent;
- crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
-
- if (crc_is_compressed(crc)) {
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
-
- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
- !c->opts.no_data_io)
- goto decompression_err;
- } else {
- /* don't need to decrypt the entire bio: */
- nonce = nonce_add(nonce, crc.offset << 9);
- bio_advance(src, crc.offset << 9);
-
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
- src->bi_iter.bi_size = dst_iter.bi_size;
-
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
-
- if (rbio->bounce) {
- struct bvec_iter src_iter = src->bi_iter;
-
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
- }
- } else {
- if (rbio->split)
- rbio->parent->pick = rbio->pick;
-
- if (rbio->bounce) {
- struct bvec_iter src_iter = src->bi_iter;
-
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
- }
-
- if (rbio->promote) {
- /*
- * Re-encrypt data we decrypted, so it's consistent with
- * rbio->crc:
- */
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
- }
-
- if (likely(!(rbio->flags & BCH_READ_in_retry))) {
- rbio = bch2_rbio_free(rbio);
- bch2_rbio_done(rbio);
- }
-out:
- memalloc_nofs_restore(nofs_flags);
- return;
-csum_err:
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
- goto out;
-decompression_err:
- bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- goto out;
-decrypt_err:
- bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- goto out;
-}
-
-static void bch2_read_endio(struct bio *bio)
-{
- struct bch_read_bio *rbio =
- container_of(bio, struct bch_read_bio, bio);
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- struct workqueue_struct *wq = NULL;
- enum rbio_context context = RBIO_CONTEXT_NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- rbio->submit_time, !bio->bi_status);
-
- if (!rbio->split)
- rbio->bio.bi_end_io = rbio->end_io;
-
- if (unlikely(bio->bi_status)) {
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
- return;
- }
-
- if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
- (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
- trace_and_count(c, io_read_reuse_race, &rbio->bio);
-
- if (rbio->flags & BCH_READ_retry_if_stale)
- bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
- else
- bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
- return;
- }
-
- if (rbio->narrow_crcs ||
- rbio->promote ||
- crc_is_compressed(rbio->pick.crc) ||
- bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
- context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
- else if (rbio->pick.crc.csum_type)
- context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
-
- bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
-}
-
-static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
- struct bch_dev *ca,
- struct bkey_s_c k,
- struct bch_extent_ptr ptr)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- PTR_BUCKET_POS(ca, &ptr),
- BTREE_ITER_cached);
-
- int gen = bucket_gen_get(ca, iter.pos.offset);
- if (gen >= 0) {
- prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
- printbuf_indent_add(&buf, 2);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- prt_printf(&buf, "memory gen: %u", gen);
-
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
- if (!ret) {
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- }
- } else {
- prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
- iter.pos.inode, iter.pos.offset);
- printbuf_indent_add(&buf, 2);
-
- prt_printf(&buf, "first bucket %u nbuckets %llu\n",
- ca->mi.first_bucket, ca->mi.nbuckets);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- }
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
-
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
-}
-
-int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bpos read_pos,
- enum btree_id data_btree, struct bkey_s_c k,
- unsigned offset_into_extent,
- struct bch_io_failures *failed, unsigned flags, int dev)
-{
- struct bch_fs *c = trans->c;
- struct extent_ptr_decoded pick;
- struct bch_read_bio *rbio = NULL;
- bool bounce = false, read_full = false, narrow_crcs = false;
- struct bpos data_pos = bkey_start_pos(k.k);
- struct data_update *u = rbio_data_update(orig);
- int ret = 0;
-
- if (bkey_extent_is_inline_data(k.k)) {
- unsigned bytes = min_t(unsigned, iter.bi_size,
- bkey_inline_data_bytes(k.k));
-
- swap(iter.bi_size, bytes);
- memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
- swap(iter.bi_size, bytes);
- bio_advance_iter(&orig->bio, &iter, bytes);
- zero_fill_bio_iter(&orig->bio, iter);
- this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
- bvec_iter_sectors(iter));
- goto out_read_done;
- }
-
- if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
- !orig->data_update)
- return bch_err_throw(c, extent_poisoned);
-retry_pick:
- ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
-
- /* hole or reservation - just zero fill: */
- if (!ret)
- goto hole;
-
- if (unlikely(ret < 0)) {
- if (ret == -BCH_ERR_data_read_csum_err) {
- int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
- if (ret2) {
- ret = ret2;
- goto err;
- }
-
- trace_and_count(c, io_read_fail_and_poison, &orig->bio);
- }
-
- struct printbuf buf = PRINTBUF;
- bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
- prt_printf(&buf, "%s\n ", bch2_err_str(ret));
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
- !c->chacha20_key_set) {
- struct printbuf buf = PRINTBUF;
- bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
- prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = bch_err_throw(c, data_read_no_encryption_key);
- goto err;
- }
-
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
- BCH_DEV_READ_REF_io_read);
-
- /*
- * Stale dirty pointers are treated as IO errors, but @failed isn't
- * allocated unless we're in the retry path - so if we're not in the
- * retry path, don't check here, it'll be caught in bch2_read_endio()
- * and we'll end up in the retry path:
- */
- if ((flags & BCH_READ_in_retry) &&
- !pick.ptr.cached &&
- ca &&
- unlikely(dev_ptr_stale(ca, &pick.ptr))) {
- read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
- bch2_mark_io_failure(failed, &pick, false);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
- goto retry_pick;
- }
-
- if (likely(!u)) {
- if (!(flags & BCH_READ_last_fragment) ||
- bio_flagged(&orig->bio, BIO_CHAIN))
- flags |= BCH_READ_must_clone;
-
- narrow_crcs = !(flags & BCH_READ_in_retry) &&
- bch2_can_narrow_extent_crcs(k, pick.crc);
-
- if (narrow_crcs && (flags & BCH_READ_user_mapped))
- flags |= BCH_READ_must_bounce;
-
- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
- if (crc_is_compressed(pick.crc) ||
- (pick.crc.csum_type != BCH_CSUM_none &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
- (flags & BCH_READ_user_mapped)) ||
- (flags & BCH_READ_must_bounce)))) {
- read_full = true;
- bounce = true;
- }
- } else {
- /*
- * can happen if we retry, and the extent we were going to read
- * has been merged in the meantime:
- */
- if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
- if (ca)
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_io_read);
- orig->ret = bch_err_throw(c, data_read_buffer_too_small);
- goto out_read_done;
- }
-
- iter.bi_size = pick.crc.compressed_size << 9;
- read_full = true;
- }
-
- if (orig->opts.promote_target || have_io_error(failed))
- rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
- &bounce, &read_full, failed);
-
- if (!read_full) {
- EBUG_ON(crc_is_compressed(pick.crc));
- EBUG_ON(pick.crc.csum_type &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- bvec_iter_sectors(iter) != pick.crc.live_size ||
- pick.crc.offset ||
- offset_into_extent));
-
- data_pos.offset += offset_into_extent;
- pick.ptr.offset += pick.crc.offset +
- offset_into_extent;
- offset_into_extent = 0;
- pick.crc.compressed_size = bvec_iter_sectors(iter);
- pick.crc.uncompressed_size = bvec_iter_sectors(iter);
- pick.crc.offset = 0;
- pick.crc.live_size = bvec_iter_sectors(iter);
- }
-
- if (rbio) {
- /*
- * promote already allocated bounce rbio:
- * promote needs to allocate a bio big enough for uncompressing
- * data in the write path, but we're not going to use it all
- * here:
- */
- EBUG_ON(rbio->bio.bi_iter.bi_size <
- pick.crc.compressed_size << 9);
- rbio->bio.bi_iter.bi_size =
- pick.crc.compressed_size << 9;
- } else if (bounce) {
- unsigned sectors = pick.crc.compressed_size;
-
- rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
- DIV_ROUND_UP(sectors, PAGE_SECTORS),
- 0,
- GFP_NOFS,
- &c->bio_read_split),
- orig);
-
- bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
- rbio->bounce = true;
- } else if (flags & BCH_READ_must_clone) {
- /*
- * Have to clone if there were any splits, due to error
- * reporting issues (if a split errored, and retrying didn't
- * work, when it reports the error to its parent (us) we don't
- * know if the error was from our bio, and we should retry, or
- * from the whole bio, in which case we don't want to retry and
- * lose the error)
- */
- rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
- &c->bio_read_split),
- orig);
- rbio->bio.bi_iter = iter;
- } else {
- rbio = orig;
- rbio->bio.bi_iter = iter;
- EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
- }
-
- EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
-
- rbio->submit_time = local_clock();
- if (!rbio->split)
- rbio->end_io = orig->bio.bi_end_io;
- rbio->bvec_iter = iter;
- rbio->offset_into_extent = offset_into_extent;
- rbio->flags = flags;
- rbio->have_ioref = ca != NULL;
- rbio->narrow_crcs = narrow_crcs;
- rbio->ret = 0;
- rbio->context = 0;
- rbio->pick = pick;
- rbio->subvol = orig->subvol;
- rbio->read_pos = read_pos;
- rbio->data_btree = data_btree;
- rbio->data_pos = data_pos;
- rbio->version = k.k->bversion;
- INIT_WORK(&rbio->work, NULL);
-
- rbio->bio.bi_opf = orig->bio.bi_opf;
- rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
- rbio->bio.bi_end_io = bch2_read_endio;
-
- async_object_list_add(c, rbio, rbio, &rbio->list_idx);
-
- if (rbio->bounce)
- trace_and_count(c, io_read_bounce, &rbio->bio);
-
- if (!u)
- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
- else
- this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
- bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
-
- /*
- * If it's being moved internally, we don't want to flag it as a cache
- * hit:
- */
- if (ca && pick.ptr.cached && !u)
- bch2_bucket_io_time_reset(trans, pick.ptr.dev,
- PTR_BUCKET_NR(ca, &pick.ptr), READ);
-
- if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
- bio_inc_remaining(&orig->bio);
- trace_and_count(c, io_read_split, &orig->bio);
- }
-
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- if (!(flags & BCH_READ_in_retry))
- bch2_trans_unlock(trans);
- else
- bch2_trans_unlock_long(trans);
-
- if (likely(!rbio->pick.do_ec_reconstruct)) {
- if (unlikely(!rbio->have_ioref)) {
- bch2_rbio_error(rbio,
- -BCH_ERR_data_read_retry_device_offline,
- BLK_STS_IOERR);
- goto out;
- }
-
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
- bio_sectors(&rbio->bio));
- bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
-
- if (unlikely(c->opts.no_data_io)) {
- if (likely(!(flags & BCH_READ_in_retry)))
- bio_endio(&rbio->bio);
- } else {
- if (likely(!(flags & BCH_READ_in_retry)))
- submit_bio(&rbio->bio);
- else
- submit_bio_wait(&rbio->bio);
- }
-
- /*
- * We just submitted IO which may block, we expect relock fail
- * events and shouldn't count them:
- */
- trans->notrace_relock_fail = true;
- } else {
- /* Attempting reconstruct read: */
- if (bch2_ec_read_extent(trans, rbio, k)) {
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
- BLK_STS_IOERR);
- goto out;
- }
-
- if (likely(!(flags & BCH_READ_in_retry)))
- bio_endio(&rbio->bio);
- }
-out:
- if (likely(!(flags & BCH_READ_in_retry))) {
- return 0;
- } else {
- bch2_trans_unlock(trans);
-
- int ret;
-
- rbio->context = RBIO_CONTEXT_UNBOUND;
- bch2_read_endio(&rbio->bio);
-
- ret = rbio->ret;
- rbio = bch2_rbio_free(rbio);
-
- if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
- bch2_mark_io_failure(failed, &pick,
- ret == -BCH_ERR_data_read_retry_csum_err);
-
- return ret;
- }
-
-err:
- if (flags & BCH_READ_in_retry)
- return ret;
-
- orig->bio.bi_status = BLK_STS_IOERR;
- orig->ret = ret;
- goto out_read_done;
-
-hole:
- this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
- bvec_iter_sectors(iter));
- /*
- * won't normally happen in the data update (bch2_move_extent()) path,
- * but if we retry and the extent we wanted to read no longer exists we
- * have to signal that:
- */
- if (u)
- orig->ret = bch_err_throw(c, data_read_key_overwritten);
-
- zero_fill_bio_iter(&orig->bio, iter);
-out_read_done:
- if ((flags & BCH_READ_last_fragment) &&
- !(flags & BCH_READ_in_retry))
- bch2_rbio_done(orig);
- return 0;
-}
-
-int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, subvol_inum inum,
- struct bch_io_failures *failed,
- struct bkey_buf *prev_read,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_buf sk;
- struct bkey_s_c k;
- enum btree_id data_btree;
- int ret;
-
- EBUG_ON(rbio->data_update);
-
- bch2_bkey_buf_init(&sk);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(inum.inum, bvec_iter.bi_sector),
- BTREE_ITER_slots);
-
- while (1) {
- data_btree = BTREE_ID_extents;
-
- bch2_trans_begin(trans);
-
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
-
- bch2_btree_iter_set_pos(trans, &iter,
- POS(inum.inum, bvec_iter.bi_sector));
-
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- s64 offset_into_extent = iter.pos.offset -
- bkey_start_offset(k.k);
- unsigned sectors = k.k->size - offset_into_extent;
-
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- ret = bch2_read_indirect_extent(trans, &data_btree,
- &offset_into_extent, &sk);
- if (ret)
- goto err;
-
- k = bkey_i_to_s_c(sk.k);
-
- if (unlikely(flags & BCH_READ_in_retry)) {
- if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
- failed->nr = 0;
- bch2_bkey_buf_copy(prev_read, c, sk.k);
- }
-
- /*
- * With indirect extents, the amount of data to read is the min
- * of the original extent and the indirect extent:
- */
- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
- unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
- swap(bvec_iter.bi_size, bytes);
-
- if (bvec_iter.bi_size == bytes)
- flags |= BCH_READ_last_fragment;
-
- ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
- data_btree, k,
- offset_into_extent, failed, flags, -1);
- swap(bvec_iter.bi_size, bytes);
-
- if (ret)
- goto err;
-
- if (flags & BCH_READ_last_fragment)
- break;
-
- bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-err:
- if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
- flags |= BCH_READ_must_bounce;
-
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- !bch2_err_matches(ret, BCH_ERR_data_read_retry))
- break;
- }
-
- if (unlikely(ret)) {
- if (ret != -BCH_ERR_extent_poisoned) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum,
- bvec_iter.bi_sector << 9));
- prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- rbio->bio.bi_status = BLK_STS_IOERR;
- rbio->ret = ret;
-
- if (!(flags & BCH_READ_in_retry))
- bch2_rbio_done(rbio);
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_bkey_buf_exit(&sk, c);
- return ret;
-}
-
-static const char * const bch2_read_bio_flags[] = {
-#define x(n) #n,
- BCH_READ_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
-{
- u64 now = local_clock();
- prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0);
- prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0);
-
- if (!rbio->split)
- prt_printf(out, "end_io:\t%ps\n", rbio->end_io);
- else
- prt_printf(out, "parent:\t%px\n", rbio->parent);
-
- prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io);
-
- prt_printf(out, "promote:\t%u\n", rbio->promote);
- prt_printf(out, "bounce:\t%u\n", rbio->bounce);
- prt_printf(out, "split:\t%u\n", rbio->split);
- prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref);
- prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs);
- prt_printf(out, "context:\t%u\n", rbio->context);
-
- int ret = READ_ONCE(rbio->ret);
- if (ret < 0)
- prt_printf(out, "ret:\t%s\n", bch2_err_str(ret));
- else
- prt_printf(out, "ret:\t%i\n", ret);
-
- prt_printf(out, "flags:\t");
- bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
- prt_newline(out);
-
- bch2_bio_to_text(out, &rbio->bio);
-}
-
-void bch2_fs_io_read_exit(struct bch_fs *c)
-{
- if (c->promote_table.tbl)
- rhashtable_destroy(&c->promote_table);
- bioset_exit(&c->bio_read_split);
- bioset_exit(&c->bio_read);
- mempool_exit(&c->bio_bounce_pages);
-}
-
-int bch2_fs_io_read_init(struct bch_fs *c)
-{
- if (mempool_init_page_pool(&c->bio_bounce_pages,
- max_t(unsigned,
- c->opts.btree_node_size,
- c->opts.encoded_extent_max) /
- PAGE_SIZE, 0))
- return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);
-
- if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS))
- return bch_err_throw(c, ENOMEM_bio_read_init);
-
- if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS))
- return bch_err_throw(c, ENOMEM_bio_read_split_init);
-
- if (rhashtable_init(&c->promote_table, &bch_promote_params))
- return bch_err_throw(c, ENOMEM_promote_table_init);
-
- return 0;
-}
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
deleted file mode 100644
index 9c5ddbf861b3..000000000000
--- a/fs/bcachefs/io_read.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_READ_H
-#define _BCACHEFS_IO_READ_H
-
-#include "bkey_buf.h"
-#include "btree_iter.h"
-#include "extents_types.h"
-#include "reflink.h"
-
-struct bch_read_bio {
- struct bch_fs *c;
- u64 start_time;
- u64 submit_time;
-
- /*
- * Reads will often have to be split, and if the extent being read from
- * was checksummed or compressed we'll also have to allocate bounce
- * buffers and copy the data back into the original bio.
- *
- * If we didn't have to split, we have to save and restore the original
- * bi_end_io - @split below indicates which:
- */
- union {
- struct bch_read_bio *parent;
- bio_end_io_t *end_io;
- };
-
- /*
- * Saved copy of bio->bi_iter, from submission time - allows us to
- * resubmit on IO error, and also to copy data back to the original bio
- * when we're bouncing:
- */
- struct bvec_iter bvec_iter;
-
- unsigned offset_into_extent;
-
- u16 flags;
- union {
- struct {
- u16 data_update:1,
- promote:1,
- bounce:1,
- split:1,
- have_ioref:1,
- narrow_crcs:1,
- saw_error:1,
- self_healing:1,
- context:2;
- };
- u16 _state;
- };
- s16 ret;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- unsigned list_idx;
-#endif
-
- struct extent_ptr_decoded pick;
-
- /*
- * pos we read from - different from data_pos for indirect extents:
- */
- u32 subvol;
- struct bpos read_pos;
-
- /*
- * start pos of data we read (may not be pos of data we want) - for
- * promote, narrow extents paths:
- */
- enum btree_id data_btree;
- struct bpos data_pos;
- struct bversion version;
-
- struct bch_io_opts opts;
-
- struct work_struct work;
-
- struct bio bio;
-};
-
-#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio)
-
-struct bch_devs_mask;
-struct cache_promote_op;
-struct extent_ptr_decoded;
-
-static inline int bch2_read_indirect_extent(struct btree_trans *trans,
- enum btree_id *data_btree,
- s64 *offset_into_extent,
- struct bkey_buf *extent)
-{
- if (extent->k->k.type != KEY_TYPE_reflink_p)
- return 0;
-
- *data_btree = BTREE_ID_reflink;
-
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
- offset_into_extent,
- bkey_i_to_s_c_reflink_p(extent->k),
- true, 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (bkey_deleted(k.k)) {
- bch2_trans_iter_exit(trans, &iter);
- return bch_err_throw(c, missing_indirect_extent);
- }
-
- bch2_bkey_buf_reassemble(extent, c, k);
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-#define BCH_READ_FLAGS() \
- x(retry_if_stale) \
- x(may_promote) \
- x(user_mapped) \
- x(last_fragment) \
- x(must_bounce) \
- x(must_clone) \
- x(in_retry)
-
-enum __bch_read_flags {
-#define x(n) __BCH_READ_##n,
- BCH_READ_FLAGS()
-#undef x
-};
-
-enum bch_read_flags {
-#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n),
- BCH_READ_FLAGS()
-#undef x
-};
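The flag definitions above use an x-macro: one list of flag names expands into bit indices, bit masks, and (in io_read.c) the string table used for printing, so the three can never drift out of sync. A self-contained illustration of the pattern with made-up flag names:

#include <stdio.h>

#define MY_FLAGS()		\
	x(retry_if_stale)	\
	x(may_promote)		\
	x(user_mapped)

enum {				/* bit indices */
#define x(n) __FLAG_##n,
	MY_FLAGS()
#undef x
};

enum {				/* bit masks */
#define x(n) FLAG_##n = 1U << __FLAG_##n,
	MY_FLAGS()
#undef x
};

static const char * const flag_names[] = {
#define x(n) #n,
	MY_FLAGS()
#undef x
	NULL
};

int main(void)
{
	unsigned flags = FLAG_may_promote|FLAG_user_mapped;

	for (unsigned i = 0; flag_names[i]; i++)
		if (flags & (1U << i))
			printf("%s\n", flag_names[i]);
	return 0;
}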
-
-int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
- struct bvec_iter, struct bpos, enum btree_id,
- struct bkey_s_c, unsigned,
- struct bch_io_failures *, unsigned, int);
-
-static inline void bch2_read_extent(struct btree_trans *trans,
- struct bch_read_bio *rbio, struct bpos read_pos,
- enum btree_id data_btree, struct bkey_s_c k,
- unsigned offset_into_extent, unsigned flags)
-{
- int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
- data_btree, k, offset_into_extent, NULL, flags, -1);
- /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */
- WARN(ret, "unhandled error from __bch2_read_extent()");
-}
-
-int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
- subvol_inum,
- struct bch_io_failures *, struct bkey_buf *, unsigned flags);
-
-static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- subvol_inum inum)
-{
- BUG_ON(rbio->_state);
-
- rbio->subvol = inum.subvol;
-
- bch2_trans_run(c,
- __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL,
- BCH_READ_retry_if_stale|
- BCH_READ_may_promote|
- BCH_READ_user_mapped));
-}
-
-static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
- struct bch_read_bio *orig)
-{
- struct bch_read_bio *rbio = to_rbio(bio);
-
- rbio->c = orig->c;
- rbio->_state = 0;
- rbio->flags = 0;
- rbio->ret = 0;
- rbio->split = true;
- rbio->parent = orig;
- rbio->opts = orig->opts;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- rbio->list_idx = 0;
-#endif
- return rbio;
-}
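rbio_init_fragment() marks the new rbio as a split and points it back at the parent; completion of the whole request then rests on a remaining-count held by the parent (bio_inc_remaining() in the submission path), so only the last fragment to finish completes the original read. A simplified standalone sketch of that completion scheme (not the real bio API):

#include <stdio.h>

struct rb {
	struct rb	*parent;
	int		remaining;
	void		(*end_io)(struct rb *);
};

static void fragment_done(struct rb *frag)
{
	struct rb *parent = frag->parent ? frag->parent : frag;

	/* last fragment to complete runs the parent's end_io */
	if (--parent->remaining == 0)
		parent->end_io(parent);
}

static void read_done(struct rb *rb)
{
	(void) rb;
	puts("whole read complete");
}

int main(void)
{
	struct rb parent = { .remaining = 2, .end_io = read_done };
	struct rb frag_a = { .parent = &parent };
	struct rb frag_b = { .parent = &parent };

	fragment_done(&frag_a);		/* nothing happens yet */
	fragment_done(&frag_b);		/* prints "whole read complete" */
	return 0;
}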
-
-static inline struct bch_read_bio *rbio_init(struct bio *bio,
- struct bch_fs *c,
- struct bch_io_opts opts,
- bio_end_io_t end_io)
-{
- struct bch_read_bio *rbio = to_rbio(bio);
-
- rbio->start_time = local_clock();
- rbio->c = c;
- rbio->_state = 0;
- rbio->flags = 0;
- rbio->ret = 0;
- rbio->opts = opts;
- rbio->bio.bi_end_io = end_io;
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- rbio->list_idx = 0;
-#endif
- return rbio;
-}
-
-struct promote_op;
-void bch2_promote_op_to_text(struct printbuf *, struct promote_op *);
-void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *);
-
-void bch2_fs_io_read_exit(struct bch_fs *);
-int bch2_fs_io_read_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_READ_H */
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
deleted file mode 100644
index 88b1eec8eff3..000000000000
--- a/fs/bcachefs/io_write.c
+++ /dev/null
@@ -1,1780 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "debug.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io_write.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/blkdev.h>
-#include <linux/prefetch.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_write_corrupt_ratio;
-module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
-MODULE_PARM_DESC(write_corrupt_ratio, "");
-#endif
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
-static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
- u64 now, int rw)
-{
- u64 latency_capable =
- ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
- /* ideally we'd be taking into account the device's variance here: */
- u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
- s64 latency_over = io_latency - latency_threshold;
-
- if (latency_threshold && latency_over > 0) {
- /*
- * bump up congested by approximately latency_over * 4 /
- * latency_threshold - we don't need much accuracy here so don't
- * bother with the divide:
- */
- if (atomic_read(&ca->congested) < CONGESTED_MAX)
- atomic_add(latency_over >>
- max_t(int, ilog2(latency_threshold) - 2, 0),
- &ca->congested);
-
- ca->congested_last = now;
- } else if (atomic_read(&ca->congested) > 0) {
- atomic_dec(&ca->congested);
- }
-}
-
-void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
-{
- atomic64_t *latency = &ca->cur_latency[rw];
- u64 now = local_clock();
- u64 io_latency = time_after64(now, submit_time)
- ? now - submit_time
- : 0;
- u64 old, new;
-
- old = atomic64_read(latency);
- do {
- /*
- * If the io latency was reasonably close to the current
- * latency, skip doing the update and atomic operation - most of
- * the time:
- */
- if (abs((int) (old - io_latency)) < (old >> 1) &&
- now & ~(~0U << 5))
- break;
-
- new = ewma_add(old, io_latency, 5);
- } while (!atomic64_try_cmpxchg(latency, &old, new));
-
- bch2_congested_acct(ca, io_latency, now, rw);
-
- __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
-}
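bch2_latency_acct() folds each I/O latency sample into a per-device exponentially-weighted moving average with weight 2^-5, and skips the atomic update entirely when the sample is already close to the current average. A standalone sketch of just the EWMA step (ewma_add() here is a local stand-in with the same weighting, not the kernel macro):

#include <inttypes.h>
#include <stdio.h>

static uint64_t ewma_add(uint64_t old, uint64_t sample, unsigned weight)
{
	/* new = old * (2^weight - 1) / 2^weight + sample / 2^weight */
	return ((old << weight) - old + sample) >> weight;
}

int main(void)
{
	uint64_t avg = 1000;				/* prior average, e.g. usec */
	uint64_t samples[] = { 1200, 800, 5000, 1000 };

	for (unsigned i = 0; i < 4; i++) {
		avg = ewma_add(avg, samples[i], 5);
		printf("sample %" PRIu64 " -> avg %" PRIu64 "\n", samples[i], avg);
	}
	return 0;
}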
-
-#endif
-
-/* Allocate, free from mempool: */
-
-void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
-{
- struct bvec_iter_all iter;
- struct bio_vec *bv;
-
- bio_for_each_segment_all(bv, bio, iter)
- if (bv->bv_page != ZERO_PAGE(0))
- mempool_free(bv->bv_page, &c->bio_bounce_pages);
- bio->bi_vcnt = 0;
-}
-
-static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
-{
- struct page *page;
-
- if (likely(!*using_mempool)) {
- page = alloc_page(GFP_NOFS);
- if (unlikely(!page)) {
- mutex_lock(&c->bio_bounce_pages_lock);
- *using_mempool = true;
- goto pool_alloc;
- }
- } else {
-pool_alloc:
- page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
- }
-
- return page;
-}
-
-void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t size)
-{
- bool using_mempool = false;
-
- while (size) {
- struct page *page = __bio_alloc_page_pool(c, &using_mempool);
- unsigned len = min_t(size_t, PAGE_SIZE, size);
-
- BUG_ON(!bio_add_page(bio, page, len, 0));
- size -= len;
- }
-
- if (using_mempool)
- mutex_unlock(&c->bio_bounce_pages_lock);
-}
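The bounce-buffer allocator above prefers ordinary page allocation and only switches to the mempool reserve, under a mutex, once an allocation fails; it then stays on the reserve for the rest of the bio. A rough userspace analogue (hypothetical names; a static array stands in for the mempool, and the out-of-memory case is simulated):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define RESERVE_SLOTS 4

static void *reserve[RESERVE_SLOTS];
static unsigned reserve_used;

static void *alloc_buf(bool simulate_oom, bool *using_reserve)
{
	if (!*using_reserve) {
		void *p = simulate_oom ? NULL : malloc(4096);
		if (p)
			return p;
		*using_reserve = true;	/* stick to the reserve from now on */
	}
	return reserve_used < RESERVE_SLOTS ? reserve[reserve_used++] : NULL;
}

int main(void)
{
	for (unsigned i = 0; i < RESERVE_SLOTS; i++)
		reserve[i] = malloc(4096);

	bool using_reserve = false;
	void *a = alloc_buf(false, &using_reserve);	/* normal allocation */
	void *b = alloc_buf(true,  &using_reserve);	/* falls back to reserve */

	printf("a=%p from the heap, b=%p from the reserve\n", a, b);
	free(a);
	/* real code returns reserve pages to the pool rather than freeing them */
	return 0;
}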
-
-/* Extent update path: */
-
-int bch2_sum_sector_overwrites(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct bkey_i *new,
- bool *usage_increasing,
- s64 *i_sectors_delta,
- s64 *disk_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c old;
- unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
- bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
- int ret = 0;
-
- *usage_increasing = false;
- *i_sectors_delta = 0;
- *disk_sectors_delta = 0;
-
- bch2_trans_copy_iter(trans, &iter, extent_iter);
-
- for_each_btree_key_max_continue_norestart(trans, iter,
- new->k.p, BTREE_ITER_slots, old, ret) {
- s64 sectors = min(new->k.p.offset, old.k->p.offset) -
- max(bkey_start_offset(&new->k),
- bkey_start_offset(old.k));
-
- *i_sectors_delta += sectors *
- (bkey_extent_is_allocation(&new->k) -
- bkey_extent_is_allocation(old.k));
-
- *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
- *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
- ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
- : 0;
-
- if (!*usage_increasing &&
- (new->k.p.snapshot != old.k->p.snapshot ||
- new_replicas > bch2_bkey_replicas(c, old) ||
- (!new_compressed && bch2_bkey_sectors_compressed(old))))
- *usage_increasing = true;
-
- if (bkey_ge(old.k->p, new->k.p))
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
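For each existing extent the new key overlaps, the overwritten length is the minimum of the end positions minus the maximum of the start positions; i_sectors_delta and disk_sectors_delta are then scaled by the allocation and replica counts. A small worked example of just the overlap arithmetic (standalone, simplified to a single old extent):

#include <inttypes.h>
#include <stdio.h>

static uint64_t overlap(uint64_t start_new, uint64_t end_new,
			uint64_t start_old, uint64_t end_old)
{
	uint64_t start = start_new > start_old ? start_new : start_old;
	uint64_t end   = end_new   < end_old   ? end_new   : end_old;

	return end > start ? end - start : 0;
}

int main(void)
{
	/* new extent covers sectors [100, 164), old extent covers [132, 200) */
	uint64_t sectors = overlap(100, 164, 132, 200);

	/* 32 sectors of the old extent are overwritten */
	printf("overlap: %" PRIu64 " sectors\n", sectors);

	/*
	 * i_sectors_delta adds sectors * (new_is_allocation - old_is_allocation),
	 * so overwriting allocated data with allocated data contributes 0.
	 */
	return 0;
}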
-
-static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- u64 new_i_size,
- s64 i_sectors_delta)
-{
- /*
- * Crazy performance optimization:
- * Every extent update needs to also update the inode: the inode trigger
- * will set bi->journal_seq to the journal sequence number of this
- * transaction - for fsync.
- *
- * But if that's the only reason we're updating the inode (we're not
- * updating bi_size or bi_sectors), then we don't need the inode update
- * to be journalled - if we crash, the bi_journal_seq update will be
- * lost, but that's fine.
- */
- unsigned inode_update_flags = BTREE_UPDATE_nojournal;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0,
- extent_iter->pos.inode,
- extent_iter->snapshot),
- BTREE_ITER_intent|
- BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (unlikely(ret))
- return ret;
-
- /*
- * varint_decode_fast(), in the inode .invalid method, reads up to 7
- * bytes past the end of the buffer:
- */
- struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
- ret = PTR_ERR_OR_ZERO(k_mut);
- if (unlikely(ret))
- goto err;
-
- bkey_reassemble(k_mut, k);
-
- if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
- k_mut = bch2_inode_to_v3(trans, k_mut);
- ret = PTR_ERR_OR_ZERO(k_mut);
- if (unlikely(ret))
- goto err;
- }
-
- struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
-
- if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
- new_i_size > le64_to_cpu(inode->v.bi_size)) {
- inode->v.bi_size = cpu_to_le64(new_i_size);
- inode_update_flags = 0;
- }
-
- if (i_sectors_delta) {
- s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors);
- if (unlikely(bi_sectors + i_sectors_delta < 0)) {
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
- prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
- extent_iter->pos.inode, bi_sectors, i_sectors_delta);
-
- bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- if (i_sectors_delta < 0)
- i_sectors_delta = -bi_sectors;
- else
- i_sectors_delta = 0;
- }
-
- le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
- inode_update_flags = 0;
- }
-
- /*
- * extent, dirent and xattr updates require that an inode update also
- * happens - to ensure that if a key exists in one of those btrees with
- * a given snapshot ID an inode is also present - so we may have to skip
- * the nojournal optimization:
- */
- if (inode->k.p.snapshot != iter.snapshot) {
- inode->k.p.snapshot = iter.snapshot;
- inode_update_flags = 0;
- }
-
- ret = bch2_trans_update(trans, &iter, &inode->k_i,
- BTREE_UPDATE_internal_snapshot_node|
- inode_update_flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_extent_update(struct btree_trans *trans,
- subvol_inum inum,
- struct btree_iter *iter,
- struct bkey_i *k,
- struct disk_reservation *disk_res,
- u64 new_i_size,
- s64 *i_sectors_delta_total,
- bool check_enospc)
-{
- struct bpos next_pos;
- bool usage_increasing;
- s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- int ret;
-
- /*
- * This traverses the iterator without changing iter->path->pos to
- * search_key() (which is pos + 1 for extents): we want there to be a
- * path already traversed at iter->pos because
- * bch2_trans_extent_update() will use it to attempt extent merging
- */
- ret = __bch2_btree_iter_traverse(trans, iter);
- if (ret)
- return ret;
-
- ret = bch2_extent_trim_atomic(trans, iter, k);
- if (ret)
- return ret;
-
- next_pos = k->k.p;
-
- ret = bch2_sum_sector_overwrites(trans, iter, k,
- &usage_increasing,
- &i_sectors_delta,
- &disk_sectors_delta);
- if (ret)
- return ret;
-
- if (disk_res &&
- disk_sectors_delta > (s64) disk_res->sectors) {
- ret = bch2_disk_reservation_add(trans->c, disk_res,
- disk_sectors_delta - disk_res->sectors,
- !check_enospc || !usage_increasing
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- return ret;
- }
-
- /*
- * Note:
- * We always have to do an inode update - even when i_size/i_sectors
- * aren't changing - for fsync to work properly; fsync relies on
- * inode->bi_journal_seq which is updated by the trigger code:
- */
- ret = bch2_extent_update_i_size_sectors(trans, iter,
- min(k->k.p.offset << 9, new_i_size),
- i_sectors_delta) ?:
- bch2_trans_update(trans, iter, k, 0) ?:
- bch2_trans_commit(trans, disk_res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc);
- if (unlikely(ret))
- return ret;
-
- if (i_sectors_delta_total)
- *i_sectors_delta_total += i_sectors_delta;
- bch2_btree_iter_set_pos(trans, iter, next_pos);
- return 0;
-}
-
-static int bch2_write_index_default(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct bkey_buf sk;
- struct keylist *keys = &op->insert_keys;
- struct bkey_i *k = bch2_keylist_front(keys);
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- subvol_inum inum = {
- .subvol = op->subvol,
- .inum = k->k.p.inode,
- };
- int ret;
-
- BUG_ON(!inum.subvol);
-
- bch2_bkey_buf_init(&sk);
-
- do {
- bch2_trans_begin(trans);
-
- k = bch2_keylist_front(keys);
- bch2_bkey_buf_copy(&sk, c, k);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
- &sk.k->k.p.snapshot);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- bkey_start_pos(&sk.k->k),
- BTREE_ITER_slots|BTREE_ITER_intent);
-
- ret = bch2_extent_update(trans, inum, &iter, sk.k,
- &op->res,
- op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_check_enospc);
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (bkey_ge(iter.pos, k->k.p))
- bch2_keylist_pop_front(&op->insert_keys);
- else
- bch2_cut_front(iter.pos, k);
- } while (!bch2_keylist_empty(keys));
-
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&sk, c);
-
- return ret;
-}
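The keylist loop above repeatedly takes the front key, lets bch2_extent_update() insert as much of it as one transaction allows, and then either pops the key or cuts off the inserted prefix and retries. A simplified standalone model of that pop-or-trim loop (made-up sector numbers, no btree involved):

#include <stdio.h>

struct key { unsigned start, end; };

int main(void)
{
	struct key keys[] = { { 0, 8 }, { 8, 20 } };
	unsigned nr = 2, front = 0, pass = 0;

	while (front < nr) {
		struct key *k = &keys[front];
		/* pretend each transaction commit only covered 6 sectors */
		unsigned done_to = k->start + 6 < k->end ? k->start + 6 : k->end;

		printf("pass %u: inserted [%u,%u)\n", ++pass, k->start, done_to);

		if (done_to == k->end)
			front++;		/* bch2_keylist_pop_front() */
		else
			k->start = done_to;	/* bch2_cut_front() */
	}
	return 0;
}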
-
-/* Writes */
-
-void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
-{
- struct printbuf buf = PRINTBUF;
-
- if (op->subvol) {
- bch2_inum_offset_err_msg(op->c, &buf,
- (subvol_inum) { op->subvol, op->pos.inode, },
- offset << 9);
- } else {
- struct bpos pos = op->pos;
- pos.offset = offset;
- bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
- }
-
- prt_str(&buf, "write error: ");
-
- va_list args;
- va_start(args, fmt);
- prt_vprintf(&buf, fmt, args);
- va_end(args);
-
- if (op->flags & BCH_WRITE_move) {
- struct data_update *u = container_of(op, struct data_update, op);
-
- prt_printf(&buf, "\n from internal move ");
- bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
- }
-
- bch_err_ratelimited(op->c, "%s", buf.buf);
- printbuf_exit(&buf);
-}
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
- enum bch_data_type type,
- const struct bkey_i *k,
- bool nocow)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
- struct bch_write_bio *n;
- unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE;
- unsigned ref_idx = type == BCH_DATA_btree
- ? BCH_DEV_READ_REF_btree_node_write
- : BCH_DEV_WRITE_REF_io_write;
-
- BUG_ON(c->opts.nochanges);
-
- const struct bch_extent_ptr *last = NULL;
- bkey_for_each_ptr(ptrs, ptr)
- last = ptr;
-
- bkey_for_each_ptr(ptrs, ptr) {
- /*
- * XXX: btree writes should be using io_ref[WRITE], but we
- * aren't retrying failed btree writes yet (due to device
- * removal/ro):
- */
- struct bch_dev *ca = nocow
- ? bch2_dev_have_ref(c, ptr->dev)
- : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx);
-
- if (ptr != last) {
- n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
-
- n->bio.bi_end_io = wbio->bio.bi_end_io;
- n->bio.bi_private = wbio->bio.bi_private;
- n->parent = wbio;
- n->split = true;
- n->bounce = false;
- n->put_bio = true;
- n->bio.bi_opf = wbio->bio.bi_opf;
- bio_inc_remaining(&wbio->bio);
- } else {
- n = wbio;
- n->split = false;
- }
-
- n->c = c;
- n->dev = ptr->dev;
- n->have_ioref = ca != NULL;
- n->nocow = nocow;
- n->submit_time = local_clock();
- n->inode_offset = bkey_start_offset(&k->k);
- if (nocow)
- n->nocow_bucket = PTR_BUCKET_NR(ca, ptr);
- n->bio.bi_iter.bi_sector = ptr->offset;
-
- if (likely(n->have_ioref)) {
- this_cpu_add(ca->io_done->sectors[WRITE][type],
- bio_sectors(&n->bio));
-
- bio_set_dev(&n->bio, ca->disk_sb.bdev);
-
- if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
- bio_endio(&n->bio);
- continue;
- }
-
- submit_bio(&n->bio);
- } else {
- n->bio.bi_status = BLK_STS_REMOVED;
- bio_endio(&n->bio);
- }
- }
-}
-
-static void __bch2_write(struct bch_write_op *);
-
-static void bch2_write_done(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
-
- EBUG_ON(op->open_buckets.nr);
-
- bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
- bch2_disk_reservation_put(c, &op->res);
-
- if (!(op->flags & BCH_WRITE_move))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_write);
- bch2_keylist_free(&op->insert_keys, op->inline_keys);
-
- EBUG_ON(cl->parent);
- closure_debug_destroy(cl);
- async_object_list_del(c, write_op, op->list_idx);
- if (op->end_io)
- op->end_io(op);
-}
-
-static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct keylist *keys = &op->insert_keys;
- struct bkey_i *src, *dst = keys->keys, *n;
-
- for (src = keys->keys; src != keys->top; src = n) {
- n = bkey_next(src);
-
- if (bkey_extent_is_direct_data(&src->k)) {
- bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
- test_bit(ptr->dev, op->failed.d));
-
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return bch_err_throw(c, data_write_io);
- }
-
- if (dst != src)
- memmove_u64s_down(dst, src, src->k.u64s);
- dst = bkey_next(dst);
- }
-
- keys->top = dst;
- return 0;
-}
-
-/**
- * __bch2_write_index - after a write, update index to point to new data
- * @op: bch_write_op to process
- */
-static void __bch2_write_index(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct keylist *keys = &op->insert_keys;
- unsigned dev;
- int ret = 0;
-
- if (unlikely(op->flags & BCH_WRITE_io_error)) {
- ret = bch2_write_drop_io_error_ptrs(op);
- if (ret)
- goto err;
- }
-
- if (!bch2_keylist_empty(keys)) {
- u64 sectors_start = keylist_sectors(keys);
-
- ret = !(op->flags & BCH_WRITE_move)
- ? bch2_write_index_default(op)
- : bch2_data_update_index_update(op);
-
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
- BUG_ON(keylist_sectors(keys) && !ret);
-
- op->written += sectors_start - keylist_sectors(keys);
-
- if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
- bch2_write_op_error(op, bkey_start_offset(&insert->k),
- "btree update error: %s", bch2_err_str(ret));
- }
-
- if (ret)
- goto err;
- }
-out:
- /* If a bucket wasn't written, we can't erasure code it: */
- for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
- bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);
-
- bch2_open_buckets_put(c, &op->open_buckets);
- return;
-err:
- keys->top = keys->keys;
- op->error = ret;
- op->flags |= BCH_WRITE_submitted;
- goto out;
-}
-
-static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
-{
- if (state != wp->state) {
- struct task_struct *p = current;
- u64 now = ktime_get_ns();
- u64 runtime = p->se.sum_exec_runtime +
- (now - p->se.exec_start);
-
- if (state == WRITE_POINT_runnable)
- wp->last_runtime = runtime;
- else if (wp->state == WRITE_POINT_runnable)
- wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;
-
- if (wp->last_state_change &&
- time_after64(now, wp->last_state_change))
- wp->time[wp->state] += now - wp->last_state_change;
- wp->state = state;
- wp->last_state_change = now;
- }
-}
-
-static inline void wp_update_state(struct write_point *wp, bool running)
-{
- enum write_point_state state;
-
- state = running ? WRITE_POINT_runnable:
- !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
- : WRITE_POINT_stopped;
-
- __wp_update_state(wp, state);
-}
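
Annotation: the two helpers above keep per-state time statistics for a write point by charging the time since the last transition to the state being left. A standalone sketch of that accounting idea, with assumed names and clock_gettime() in place of the kernel clocks:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

enum wp_state { WP_RUNNABLE, WP_WAITING_IO, WP_STOPPED, WP_NR };

struct wp_stats {
	enum wp_state	state;
	uint64_t	last_change_ns;
	uint64_t	time_ns[WP_NR];
};

static uint64_t now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void wp_set_state(struct wp_stats *wp, enum wp_state new_state)
{
	uint64_t now = now_ns();

	if (new_state == wp->state)
		return;

	/* charge elapsed time to the state we are leaving */
	if (wp->last_change_ns)
		wp->time_ns[wp->state] += now - wp->last_change_ns;

	wp->state = new_state;
	wp->last_change_ns = now;
}

int main(void)
{
	struct wp_stats wp = { .state = WP_STOPPED };

	wp_set_state(&wp, WP_RUNNABLE);
	wp_set_state(&wp, WP_WAITING_IO);
	wp_set_state(&wp, WP_STOPPED);

	printf("runnable: %llu ns, waiting_io: %llu ns\n",
	       (unsigned long long)wp.time_ns[WP_RUNNABLE],
	       (unsigned long long)wp.time_ns[WP_WAITING_IO]);
	return 0;
}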
-
-static CLOSURE_CALLBACK(bch2_write_index)
-{
- closure_type(op, struct bch_write_op, cl);
- struct write_point *wp = op->wp;
- struct workqueue_struct *wq = index_update_wq(op);
- unsigned long flags;
-
- if ((op->flags & BCH_WRITE_submitted) &&
- (op->flags & BCH_WRITE_move))
- bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
-
- spin_lock_irqsave(&wp->writes_lock, flags);
- if (wp->state == WRITE_POINT_waiting_io)
- __wp_update_state(wp, WRITE_POINT_waiting_work);
- list_add_tail(&op->wp_list, &wp->writes);
- spin_unlock_irqrestore(&wp->writes_lock, flags);
-
- queue_work(wq, &wp->index_update_work);
-}
-
-static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
-{
- op->wp = wp;
-
- if (wp->state == WRITE_POINT_stopped) {
- spin_lock_irq(&wp->writes_lock);
- __wp_update_state(wp, WRITE_POINT_waiting_io);
- spin_unlock_irq(&wp->writes_lock);
- }
-}
-
-void bch2_write_point_do_index_updates(struct work_struct *work)
-{
- struct write_point *wp =
- container_of(work, struct write_point, index_update_work);
- struct bch_write_op *op;
-
- while (1) {
- spin_lock_irq(&wp->writes_lock);
- op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
- wp_update_state(wp, op != NULL);
- spin_unlock_irq(&wp->writes_lock);
-
- if (!op)
- break;
-
- op->flags |= BCH_WRITE_in_worker;
-
- __bch2_write_index(op);
-
- if (!(op->flags & BCH_WRITE_submitted))
- __bch2_write(op);
- else
- bch2_write_done(&op->cl);
- }
-}
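
Annotation: the worker above drains the write point's list with the usual pop-under-lock pattern: take the lock, pop one entry, drop the lock, do the (possibly slow) work, repeat until the pop returns nothing. A standalone sketch of that drain loop with assumed names and a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work_item {
	struct work_item *next;
	int		 id;
};

static pthread_mutex_t	 lock = PTHREAD_MUTEX_INITIALIZER;
static struct work_item *head;

static void drain(void)
{
	while (1) {
		pthread_mutex_lock(&lock);
		struct work_item *w = head;
		if (w)
			head = w->next;
		pthread_mutex_unlock(&lock);

		if (!w)
			break;

		/* do the work without holding the lock */
		printf("processing item %d\n", w->id);
		free(w);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct work_item *w = malloc(sizeof(*w));
		w->id = i;
		w->next = head;
		head = w;
	}

	drain();
	return 0;
}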
-
-static void bch2_write_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
- struct bch_fs *c = wbio->c;
- struct bch_dev *ca = wbio->have_ioref
- ? bch2_dev_have_ref(c, wbio->dev)
- : NULL;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
- wbio->submit_time, !bio->bi_status);
-
- if (unlikely(bio->bi_status)) {
- if (ca)
- bch_err_inum_offset_ratelimited(ca,
- op->pos.inode,
- wbio->inode_offset << 9,
- "data write error: %s",
- bch2_blk_status_to_str(bio->bi_status));
- else
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- wbio->inode_offset << 9,
- "data write error: %s",
- bch2_blk_status_to_str(bio->bi_status));
- set_bit(wbio->dev, op->failed.d);
- op->flags |= BCH_WRITE_io_error;
- }
-
- if (wbio->nocow) {
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- POS(ca->dev_idx, wbio->nocow_bucket),
- BUCKET_NOCOW_LOCK_UPDATE);
- set_bit(wbio->dev, op->devs_need_flush->d);
- }
-
- if (wbio->have_ioref)
- enumerated_ref_put(&ca->io_ref[WRITE],
- BCH_DEV_WRITE_REF_io_write);
-
- if (wbio->bounce)
- bch2_bio_free_pages_pool(c, bio);
-
- if (wbio->put_bio)
- bio_put(bio);
-
- if (parent)
- bio_endio(&parent->bio);
- else
- closure_put(cl);
-}
-
-static void init_append_extent(struct bch_write_op *op,
- struct write_point *wp,
- struct bversion version,
- struct bch_extent_crc_unpacked crc)
-{
- struct bkey_i_extent *e;
-
- op->pos.offset += crc.uncompressed_size;
-
- e = bkey_extent_init(op->insert_keys.top);
- e->k.p = op->pos;
- e->k.size = crc.uncompressed_size;
- e->k.bversion = version;
-
- if (crc.csum_type ||
- crc.compression_type ||
- crc.nonce)
- bch2_extent_crc_append(&e->k_i, crc);
-
- bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
- op->flags & BCH_WRITE_cached);
-
- if (!(op->flags & BCH_WRITE_move))
- bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i);
-
- bch2_keylist_push(&op->insert_keys);
-}
-
-static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
- struct write_point *wp,
- struct bio *src,
- bool *page_alloc_failed,
- void *buf)
-{
- struct bch_write_bio *wbio;
- struct bio *bio;
- unsigned output_available =
- min(wp->sectors_free << 9, src->bi_iter.bi_size);
- unsigned pages = DIV_ROUND_UP(output_available +
- (buf
- ? ((unsigned long) buf & (PAGE_SIZE - 1))
- : 0), PAGE_SIZE);
-
- pages = min(pages, BIO_MAX_VECS);
-
- bio = bio_alloc_bioset(NULL, pages, 0,
- GFP_NOFS, &c->bio_write);
- wbio = wbio_init(bio);
- wbio->put_bio = true;
- /* copy WRITE_SYNC flag */
- wbio->bio.bi_opf = src->bi_opf;
-
- if (buf) {
- bch2_bio_map(bio, buf, output_available);
- return bio;
- }
-
- wbio->bounce = true;
-
- /*
- * We can't use mempool for more than c->sb.encoded_extent_max
- * worth of pages, but we'd like to allocate more if we can:
- */
- bch2_bio_alloc_pages_pool(c, bio,
- min_t(unsigned, output_available,
- c->opts.encoded_extent_max));
-
- if (bio->bi_iter.bi_size < output_available)
- *page_alloc_failed =
- bch2_bio_alloc_pages(bio,
- output_available -
- bio->bi_iter.bi_size,
- GFP_NOFS) != 0;
-
- return bio;
-}
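
Annotation: the page count above accounts for a bounce buffer that may not start on a page boundary: a buffer of len bytes starting offset bytes into a page spans DIV_ROUND_UP(len + offset, PAGE_SIZE) pages. A tiny standalone sketch of that arithmetic (assumed names, fixed 4 KiB pages):

#include <stdio.h>

#define PAGE_SIZE	4096u
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned pages_spanned(unsigned long buf_addr, unsigned len)
{
	unsigned offset = buf_addr & (PAGE_SIZE - 1);

	return DIV_ROUND_UP(len + offset, PAGE_SIZE);
}

int main(void)
{
	/* 8 KiB starting 100 bytes into a page touches 3 pages */
	printf("%u\n", pages_spanned(0x1000 + 100, 8192));
	return 0;
}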
-
-static int bch2_write_rechecksum(struct bch_fs *c,
- struct bch_write_op *op,
- unsigned new_csum_type)
-{
- struct bio *bio = &op->wbio.bio;
- struct bch_extent_crc_unpacked new_crc;
-
- /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
-
- if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(new_csum_type))
- new_csum_type = op->crc.csum_type;
-
- int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
- NULL, &new_crc,
- op->crc.offset, op->crc.live_size,
- new_csum_type);
- if (ret)
- return ret;
-
- bio_advance(bio, op->crc.offset << 9);
- bio->bi_iter.bi_size = op->crc.live_size << 9;
- op->crc = new_crc;
- return 0;
-}
-
-static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
-{
- struct bch_fs *c = op->c;
- struct bio *bio = &op->wbio.bio;
- struct bch_csum csum;
- int ret = 0;
-
- BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
-
- /* Can we just write the entire extent as is? */
- if (op->crc.uncompressed_size == op->crc.live_size &&
- op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
- op->crc.compressed_size <= wp->sectors_free &&
- (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
- op->incompressible)) {
- if (!crc_is_compressed(op->crc) &&
- op->csum_type != op->crc.csum_type) {
- ret = bch2_write_rechecksum(c, op, op->csum_type);
- if (ret)
- return ret;
- }
-
- return 1;
- }
-
- /*
- * If the data is compressed and we couldn't write the entire extent as
- * is, we have to decompress it:
- */
- if (crc_is_compressed(op->crc)) {
- /* Last point we can still verify checksum: */
- struct nonce nonce = extent_nonce(op->version, op->crc);
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- goto csum_err;
-
- if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
- if (ret)
- return ret;
-
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- }
-
- ret = bch2_bio_uncompress_inplace(op, bio);
- if (ret)
- return ret;
- }
-
- /*
- * No longer have compressed data after this point - data might be
- * encrypted:
- */
-
- /*
- * If the data is checksummed and we're only writing a subset,
- * rechecksum and adjust bio to point to currently live data:
- */
- if (op->crc.live_size != op->crc.uncompressed_size ||
- op->crc.csum_type != op->csum_type) {
- ret = bch2_write_rechecksum(c, op, op->csum_type);
- if (ret)
- return ret;
- }
-
- /*
- * If we want to compress the data, it has to be decrypted:
- */
- if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
- (op->compression_opt || op->crc.csum_type != op->csum_type)) {
- struct nonce nonce = extent_nonce(op->version, op->crc);
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- goto csum_err;
-
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
- if (ret)
- return ret;
-
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- }
-
- return 0;
-csum_err:
- bch2_write_op_error(op, op->pos.offset,
- "error verifying existing checksum while moving existing data (memory corruption?)\n"
- " expected %0llx:%0llx got %0llx:%0llx type %s",
- op->crc.csum.hi,
- op->crc.csum.lo,
- csum.hi,
- csum.lo,
- op->crc.csum_type < BCH_CSUM_NR
- ? __bch2_csum_types[op->crc.csum_type]
- : "(unknown)");
- return bch_err_throw(c, data_write_csum);
-}
-
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
- struct bio **_dst)
-{
- struct bch_fs *c = op->c;
- struct bio *src = &op->wbio.bio, *dst = src;
- struct bvec_iter saved_iter;
- void *ec_buf;
- unsigned total_output = 0, total_input = 0;
- bool bounce = false;
- bool page_alloc_failed = false;
- int ret, more = 0;
-
- if (op->incompressible)
- op->compression_opt = 0;
-
- BUG_ON(!bio_sectors(src));
-
- ec_buf = bch2_writepoint_ec_buf(c, wp);
-
- if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
- ret = bch2_write_prep_encoded_data(op, wp);
- if (ret < 0)
- goto err;
- if (ret) {
- if (ec_buf) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bio_copy_data(dst, src);
- bounce = true;
- }
- init_append_extent(op, wp, op->version, op->crc);
- goto do_write;
- }
- }
-
- if (ec_buf ||
- op->compression_opt ||
- (op->csum_type &&
- !(op->flags & BCH_WRITE_pages_stable)) ||
- (bch2_csum_type_is_encryption(op->csum_type) &&
- !(op->flags & BCH_WRITE_pages_owned))) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bounce = true;
- }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
- if (!bounce && write_corrupt_ratio) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bounce = true;
- }
-#endif
- saved_iter = dst->bi_iter;
-
- do {
- struct bch_extent_crc_unpacked crc = { 0 };
- struct bversion version = op->version;
- size_t dst_len = 0, src_len = 0;
-
- if (page_alloc_failed &&
- dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
- dst->bi_iter.bi_size < c->opts.encoded_extent_max)
- break;
-
- BUG_ON(op->compression_opt &&
- (op->flags & BCH_WRITE_data_encoded) &&
- bch2_csum_type_is_encryption(op->crc.csum_type));
- BUG_ON(op->compression_opt && !bounce);
-
- crc.compression_type = op->incompressible
- ? BCH_COMPRESSION_TYPE_incompressible
- : op->compression_opt
- ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
- op->compression_opt)
- : 0;
- if (!crc_is_compressed(crc)) {
- dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
- dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
-
- if (op->csum_type)
- dst_len = min_t(unsigned, dst_len,
- c->opts.encoded_extent_max);
-
- if (bounce) {
- swap(dst->bi_iter.bi_size, dst_len);
- bio_copy_data(dst, src);
- swap(dst->bi_iter.bi_size, dst_len);
- }
-
- src_len = dst_len;
- }
-
- BUG_ON(!src_len || !dst_len);
-
- if (bch2_csum_type_is_encryption(op->csum_type)) {
- if (bversion_zero(version)) {
- version.lo = atomic64_inc_return(&c->key_version);
- } else {
- crc.nonce = op->nonce;
- op->nonce += src_len >> 9;
- }
- }
-
- if ((op->flags & BCH_WRITE_data_encoded) &&
- !crc_is_compressed(crc) &&
- bch2_csum_type_is_encryption(op->crc.csum_type) ==
- bch2_csum_type_is_encryption(op->csum_type)) {
- u8 compression_type = crc.compression_type;
- u16 nonce = crc.nonce;
- /*
- * Note: when we're using rechecksum(), we need to be
- * checksumming @src because it has all the data our
- * existing checksum covers - if we bounced (because we
- * were trying to compress), @dst will only have the
- * part of the data the new checksum will cover.
- *
- * But normally we want to be checksumming post bounce,
- * because part of the reason for bouncing is so the
- * data can't be modified (by userspace) while it's in
- * flight.
- */
- ret = bch2_rechecksum_bio(c, src, version, op->crc,
- &crc, &op->crc,
- src_len >> 9,
- bio_sectors(src) - (src_len >> 9),
- op->csum_type);
- if (ret)
- goto err;
- /*
- * rechecksum_bio sets compression_type on crc from op->crc,
- * this isn't always correct as sometimes we're changing
- * an extent from uncompressed to incompressible.
- */
- crc.compression_type = compression_type;
- crc.nonce = nonce;
- } else {
- if ((op->flags & BCH_WRITE_data_encoded) &&
- (ret = bch2_rechecksum_bio(c, src, version, op->crc,
- NULL, &op->crc,
- src_len >> 9,
- bio_sectors(src) - (src_len >> 9),
- op->crc.csum_type)))
- goto err;
-
- crc.compressed_size = dst_len >> 9;
- crc.uncompressed_size = src_len >> 9;
- crc.live_size = src_len >> 9;
-
- swap(dst->bi_iter.bi_size, dst_len);
- ret = bch2_encrypt_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
- if (ret)
- goto err;
-
- crc.csum = bch2_checksum_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
- crc.csum_type = op->csum_type;
- swap(dst->bi_iter.bi_size, dst_len);
- }
-
- init_append_extent(op, wp, version, crc);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- if (write_corrupt_ratio) {
- swap(dst->bi_iter.bi_size, dst_len);
- bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
- swap(dst->bi_iter.bi_size, dst_len);
- }
-#endif
-
- if (dst != src)
- bio_advance(dst, dst_len);
- bio_advance(src, src_len);
- total_output += dst_len;
- total_input += src_len;
- } while (dst->bi_iter.bi_size &&
- src->bi_iter.bi_size &&
- wp->sectors_free &&
- !bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX));
-
- more = src->bi_iter.bi_size != 0;
-
- dst->bi_iter = saved_iter;
-
- if (dst == src && more) {
- BUG_ON(total_output != total_input);
-
- dst = bio_split(src, total_input >> 9,
- GFP_NOFS, &c->bio_write);
- wbio_init(dst)->put_bio = true;
- /* copy WRITE_SYNC flag */
- dst->bi_opf = src->bi_opf;
- }
-
- dst->bi_iter.bi_size = total_output;
-do_write:
- *_dst = dst;
- return more;
-err:
- if (to_wbio(dst)->bounce)
- bch2_bio_free_pages_pool(c, dst);
- if (to_wbio(dst)->put_bio)
- bio_put(dst);
-
- return ret;
-}
-
-static bool bch2_extent_is_writeable(struct bch_write_op *op,
- struct bkey_s_c k)
-{
- struct bch_fs *c = op->c;
- struct bkey_s_c_extent e;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
- unsigned replicas = 0;
-
- if (k.k->type != KEY_TYPE_extent)
- return false;
-
- e = bkey_s_c_to_extent(k);
-
- guard(rcu)();
- extent_for_each_ptr_decode(e, p, entry) {
- if (crc_is_encoded(p.crc) || p.has_ec)
- return false;
-
- replicas += bch2_extent_ptr_durability(c, &p);
- }
-
- return replicas >= op->opts.data_replicas;
-}
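
Annotation: the check above decides whether an extent can be overwritten in place by summing the durability contributed by each pointer and comparing it to the configured replica count; erasure-coded or encoded pointers force the COW path. A standalone sketch of that decision with made-up types:

#include <stdbool.h>
#include <stdio.h>

struct ptr { unsigned durability; bool has_ec; };

static bool extent_is_writeable(const struct ptr *ptrs, unsigned nr,
				unsigned required_replicas)
{
	unsigned replicas = 0;

	for (unsigned i = 0; i < nr; i++) {
		if (ptrs[i].has_ec)	/* erasure coded: fall back to COW */
			return false;
		replicas += ptrs[i].durability;
	}

	return replicas >= required_replicas;
}

int main(void)
{
	struct ptr ptrs[] = { { .durability = 1 }, { .durability = 1 } };

	printf("%s\n", extent_is_writeable(ptrs, 2, 2) ? "nocow ok" : "cow");
	return 0;
}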
-
-static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *orig,
- struct bkey_s_c k,
- u64 new_i_size)
-{
- if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
- /* trace this */
- return 0;
- }
-
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bch2_cut_front(bkey_start_pos(&orig->k), new);
- bch2_cut_back(orig->k.p, new);
-
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr(ptrs, ptr)
- ptr->unwritten = 0;
-
- /*
- * Note that we're not calling bch2_subvol_get_snapshot() in this path -
- * that was done when we kicked off the write, and here it's important
- * that we update the extent that we wrote to - even if a snapshot has
- * since been created. The write is still outstanding, so we're ok
- * w.r.t. snapshot atomicity:
- */
- return bch2_extent_update_i_size_sectors(trans, iter,
- min(new->k.p.offset << 9, new_i_size), 0) ?:
- bch2_trans_update(trans, iter, new,
- BTREE_UPDATE_internal_snapshot_node);
-}
-
-static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
-
- for_each_keylist_key(&op->insert_keys, orig) {
- ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
- bkey_start_pos(&orig->k), orig->k.p,
- BTREE_ITER_intent, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
- }));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
-
- if (ret && !bch2_err_matches(ret, EROFS)) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- bch2_write_op_error(op, bkey_start_offset(&insert->k),
- "btree update error: %s", bch2_err_str(ret));
- }
-
- if (ret)
- op->error = ret;
-}
-
-static void __bch2_nocow_write_done(struct bch_write_op *op)
-{
- if (unlikely(op->flags & BCH_WRITE_io_error)) {
- op->error = bch_err_throw(op->c, data_write_io);
- } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
- bch2_nocow_write_convert_unwritten(op);
-}
-
-static CLOSURE_CALLBACK(bch2_nocow_write_done)
-{
- closure_type(op, struct bch_write_op, cl);
-
- __bch2_nocow_write_done(op);
- bch2_write_done(cl);
-}
-
-struct bucket_to_lock {
- struct bpos b;
- unsigned gen;
- struct nocow_lock_bucket *l;
-};
-
-static void bch2_nocow_write(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
- u32 snapshot;
- struct bucket_to_lock *stale_at;
- int stale, ret;
-
- if (op->flags & BCH_WRITE_move)
- return;
-
- darray_init(&buckets);
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
- if (unlikely(ret))
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(op->pos.inode, op->pos.offset, snapshot),
- BTREE_ITER_slots);
- while (1) {
- struct bio *bio = &op->wbio.bio;
-
- buckets.nr = 0;
-
- ret = bch2_trans_relock(trans);
- if (ret)
- break;
-
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- /* fall back to normal cow write path? */
- if (unlikely(k.k->p.snapshot != snapshot ||
- !bch2_extent_is_writeable(op, k)))
- break;
-
- if (bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- k.k->u64s))
- break;
-
- /* Get iorefs before dropping btree locks: */
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
- BCH_DEV_WRITE_REF_io_write);
- if (unlikely(!ca))
- goto err_get_ioref;
-
- struct bpos b = PTR_BUCKET_POS(ca, ptr);
- struct nocow_lock_bucket *l =
- bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
- prefetch(l);
-
- /* XXX allocating memory with btree locks held - rare */
- darray_push_gfp(&buckets, ((struct bucket_to_lock) {
- .b = b, .gen = ptr->gen, .l = l,
- }), GFP_KERNEL|__GFP_NOFAIL);
-
- if (ptr->unwritten)
- op->flags |= BCH_WRITE_convert_unwritten;
- }
-
- /* Unlock before taking nocow locks, doing IO: */
- bkey_reassemble(op->insert_keys.top, k);
- bch2_trans_unlock(trans);
-
- bch2_cut_front(op->pos, op->insert_keys.top);
- if (op->flags & BCH_WRITE_convert_unwritten)
- bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
-
- darray_for_each(buckets, i) {
- struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
-
- __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
- bucket_to_u64(i->b),
- BUCKET_NOCOW_LOCK_UPDATE);
-
- int gen = bucket_gen_get(ca, i->b.offset);
- stale = gen < 0 ? gen : gen_after(gen, i->gen);
- if (unlikely(stale)) {
- stale_at = i;
- goto err_bucket_stale;
- }
- }
-
- bio = &op->wbio.bio;
- if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
- bio = bio_split(bio, k.k->p.offset - op->pos.offset,
- GFP_KERNEL, &c->bio_write);
- wbio_init(bio)->put_bio = true;
- bio->bi_opf = op->wbio.bio.bi_opf;
- } else {
- op->flags |= BCH_WRITE_submitted;
- }
-
- op->pos.offset += bio_sectors(bio);
- op->written += bio_sectors(bio);
-
- bio->bi_end_io = bch2_write_endio;
- bio->bi_private = &op->cl;
- bio->bi_opf |= REQ_OP_WRITE;
- closure_get(&op->cl);
-
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
- op->insert_keys.top, true);
-
- bch2_keylist_push(&op->insert_keys);
- if (op->flags & BCH_WRITE_submitted)
- break;
- bch2_btree_iter_advance(trans, &iter);
- }
-out:
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
- darray_exit(&buckets);
-
- if (ret) {
- bch2_write_op_error(op, op->pos.offset,
- "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
- op->error = ret;
- op->flags |= BCH_WRITE_submitted;
- }
-
- /* fallback to cow write path? */
- if (!(op->flags & BCH_WRITE_submitted)) {
- closure_sync(&op->cl);
- __bch2_nocow_write_done(op);
- op->insert_keys.top = op->insert_keys.keys;
- } else if (op->flags & BCH_WRITE_sync) {
- closure_sync(&op->cl);
- bch2_nocow_write_done(&op->cl.work);
- } else {
- /*
- * XXX
- * needs to run out of process context because ei_quota_lock is
- * a mutex
- */
- continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
- }
- return;
-err_get_ioref:
- darray_for_each(buckets, i)
- enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
- BCH_DEV_WRITE_REF_io_write);
-
- /* Fall back to COW path: */
- goto out;
-err_bucket_stale:
- darray_for_each(buckets, i) {
- bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
- if (i == stale_at)
- break;
- }
-
- struct printbuf buf = PRINTBUF;
- if (bch2_fs_inconsistent_on(stale < 0, c,
- "pointer to invalid bucket in nocow path on device %llu\n %s",
- stale_at->b.inode,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch_err_throw(c, data_write_invalid_ptr);
- } else {
- /* We can retry this: */
- ret = bch_err_throw(c, transaction_restart);
- }
- printbuf_exit(&buf);
-
- goto err_get_ioref;
-}
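
Annotation: the nocow path above samples each bucket's generation while the btree locks are still held, then revalidates it after taking the nocow lock; a bumped generation means the bucket was reused in the meantime, so the write is unwound and retried. A heavily simplified standalone sketch of that sample-then-revalidate idea (assumed names, C11 atomics in place of the bucket code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct bucket { atomic_uint gen; };

static bool still_valid(struct bucket *b, unsigned seen_gen)
{
	return atomic_load(&b->gen) == seen_gen;
}

int main(void)
{
	struct bucket b = { .gen = 4 };
	unsigned seen = atomic_load(&b.gen);	/* sampled under btree locks */

	atomic_fetch_add(&b.gen, 1);		/* bucket reused meanwhile */

	if (!still_valid(&b, seen))
		printf("bucket went stale, retrying the write\n");
	return 0;
}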
-
-static void __bch2_write(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct write_point *wp = NULL;
- struct bio *bio = NULL;
- unsigned nofs_flags;
- int ret;
-
- nofs_flags = memalloc_nofs_save();
-
- if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
- bch2_nocow_write(op);
- if (op->flags & BCH_WRITE_submitted)
- goto out_nofs_restore;
- }
-again:
- memset(&op->failed, 0, sizeof(op->failed));
-
- do {
- struct bkey_i *key_to_write;
- unsigned key_to_write_offset = op->insert_keys.top_p -
- op->insert_keys.keys_p;
-
- /* +1 for possible cache device: */
- if (op->open_buckets.nr + op->nr_replicas + 1 >
- ARRAY_SIZE(op->open_buckets.v))
- break;
-
- if (bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX))
- break;
-
- /*
- * The copygc thread is now global, which means it's no longer
- * freeing up space on specific disks, which means that
- * allocations for specific disks may hang arbitrarily long:
- */
- ret = bch2_trans_run(c, lockrestart_do(trans,
- bch2_alloc_sectors_start_trans(trans,
- op->target,
- op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
- op->write_point,
- &op->devs_have,
- op->nr_replicas,
- op->nr_replicas_required,
- op->watermark,
- op->flags,
- &op->cl, &wp)));
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- break;
-
- goto err;
- }
-
- EBUG_ON(!wp);
-
- bch2_open_bucket_get(c, wp, &op->open_buckets);
- ret = bch2_write_extent(op, wp, &bio);
-
- bch2_alloc_sectors_done_inlined(c, wp);
-err:
- if (ret <= 0) {
- op->flags |= BCH_WRITE_submitted;
-
- if (unlikely(ret < 0)) {
- if (!(op->flags & BCH_WRITE_alloc_nowait))
- bch2_write_op_error(op, op->pos.offset,
- "%s(): %s", __func__, bch2_err_str(ret));
- op->error = ret;
- break;
- }
- }
-
- bio->bi_end_io = bch2_write_endio;
- bio->bi_private = &op->cl;
- bio->bi_opf |= REQ_OP_WRITE;
-
- closure_get(bio->bi_private);
-
- key_to_write = (void *) (op->insert_keys.keys_p +
- key_to_write_offset);
-
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
- key_to_write, false);
- } while (ret);
-
- /*
- * Sync or no?
- *
- * If we're running asynchronously, we may still want to block
- * synchronously here if we weren't able to submit all of the IO at
- * once, as that signals backpressure to the caller.
- */
- if ((op->flags & BCH_WRITE_sync) ||
- (!(op->flags & BCH_WRITE_submitted) &&
- !(op->flags & BCH_WRITE_in_worker))) {
- bch2_wait_on_allocator(c, &op->cl);
-
- __bch2_write_index(op);
-
- if (!(op->flags & BCH_WRITE_submitted))
- goto again;
- bch2_write_done(&op->cl);
- } else {
- bch2_write_queue(op, wp);
- continue_at(&op->cl, bch2_write_index, NULL);
- }
-out_nofs_restore:
- memalloc_nofs_restore(nofs_flags);
-}
-
-static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
-{
- struct bio *bio = &op->wbio.bio;
- struct bvec_iter iter;
- struct bkey_i_inline_data *id;
- unsigned sectors;
- int ret;
-
- memset(&op->failed, 0, sizeof(op->failed));
-
- op->flags |= BCH_WRITE_wrote_data_inline;
- op->flags |= BCH_WRITE_submitted;
-
- bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
-
- ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_U64s + DIV_ROUND_UP(data_len, 8));
- if (ret) {
- op->error = ret;
- goto err;
- }
-
- sectors = bio_sectors(bio);
- op->pos.offset += sectors;
-
- id = bkey_inline_data_init(op->insert_keys.top);
- id->k.p = op->pos;
- id->k.bversion = op->version;
- id->k.size = sectors;
-
- iter = bio->bi_iter;
- iter.bi_size = data_len;
- memcpy_from_bio(id->v.data, bio, iter);
-
- while (data_len & 7)
- id->v.data[data_len++] = '\0';
- set_bkey_val_bytes(&id->k, data_len);
- bch2_keylist_push(&op->insert_keys);
-
- __bch2_write_index(op);
-err:
- bch2_write_done(&op->cl);
-}
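
Annotation: the inline-data path above pads the copied data with NULs until its length is a multiple of 8 bytes, so the key's value is a whole number of u64s. A standalone sketch of that padding step with assumed names:

#include <stdio.h>
#include <string.h>

static unsigned pad_to_u64s(unsigned char *buf, unsigned len)
{
	while (len & 7)
		buf[len++] = '\0';
	return len;		/* padded length, now a multiple of 8 */
}

int main(void)
{
	unsigned char buf[16];

	memcpy(buf, "hello", 5);
	printf("padded to %u bytes\n", pad_to_u64s(buf, 5));	/* prints 8 */
	return 0;
}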
-
-/**
- * bch2_write() - handle a write to a cache device or flash only volume
- * @cl: &bch_write_op->cl
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written it calls bch_journal, and after the keys have been
- * added to the next journal write they're inserted into the btree.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-CLOSURE_CALLBACK(bch2_write)
-{
- closure_type(op, struct bch_write_op, cl);
- struct bio *bio = &op->wbio.bio;
- struct bch_fs *c = op->c;
- unsigned data_len;
-
- EBUG_ON(op->cl.parent);
- BUG_ON(!op->nr_replicas);
- BUG_ON(!op->write_point.v);
- BUG_ON(bkey_eq(op->pos, POS_MAX));
-
- async_object_list_add(c, write_op, op, &op->list_idx);
-
- if (op->flags & BCH_WRITE_only_specified_devs)
- op->flags |= BCH_WRITE_alloc_nowait;
-
- op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
- op->start_time = local_clock();
- bch2_keylist_init(&op->insert_keys, op->inline_keys);
- wbio_init(bio)->put_bio = false;
-
- if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
- bch2_write_op_error(op, op->pos.offset, "misaligned write");
- op->error = bch_err_throw(c, data_write_misaligned);
- goto err;
- }
-
- if (c->opts.nochanges) {
- op->error = bch_err_throw(c, erofs_no_writes);
- goto err;
- }
-
- if (!(op->flags & BCH_WRITE_move) &&
- !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) {
- op->error = bch_err_throw(c, erofs_no_writes);
- goto err;
- }
-
- if (!(op->flags & BCH_WRITE_move))
- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
- bch2_increment_clock(c, bio_sectors(bio), WRITE);
-
- data_len = min_t(u64, bio->bi_iter.bi_size,
- op->new_i_size - (op->pos.offset << 9));
-
- if (c->opts.inline_data &&
- data_len <= min(block_bytes(c) / 2, 1024U)) {
- bch2_write_data_inline(op, data_len);
- return;
- }
-
- __bch2_write(op);
- return;
-err:
- bch2_disk_reservation_put(c, &op->res);
-
- closure_debug_destroy(&op->cl);
- async_object_list_del(c, write_op, op->list_idx);
- if (op->end_io)
- op->end_io(op);
-}
-
-static const char * const bch2_write_flags[] = {
-#define x(f) #f,
- BCH_WRITE_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- prt_printf(out, "pos:\t");
- bch2_bpos_to_text(out, op->pos);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "started:\t");
- bch2_pr_time_units(out, local_clock() - op->start_time);
- prt_newline(out);
-
- prt_printf(out, "flags:\t");
- prt_bitflags(out, bch2_write_flags, op->flags);
- prt_newline(out);
-
- prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
- prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
-
- prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
- prt_printf(out, "ret\t%s\n", bch2_err_str(op->error));
-
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_fs_io_write_exit(struct bch_fs *c)
-{
- bioset_exit(&c->replica_set);
- bioset_exit(&c->bio_write);
-}
-
-int bch2_fs_io_write_init(struct bch_fs *c)
-{
- if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
- bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
- return bch_err_throw(c, ENOMEM_bio_write_init);
-
- return 0;
-}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
deleted file mode 100644
index 2c0a8f35ee1f..000000000000
--- a/fs/bcachefs/io_write.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_WRITE_H
-#define _BCACHEFS_IO_WRITE_H
-
-#include "checksum.h"
-#include "io_write_types.h"
-
-#define to_wbio(_bio) \
- container_of((_bio), struct bch_write_bio, bio)
-
-void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
-void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
- enum bch_data_type, const struct bkey_i *, bool);
-
-__printf(3, 4)
-void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...);
-
-static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
- return op->watermark == BCH_WATERMARK_copygc
- ? op->c->copygc_wq
- : op->c->btree_update_wq;
-}
-
-int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, subvol_inum,
- struct btree_iter *, struct bkey_i *,
- struct disk_reservation *, u64, s64 *, bool);
-
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct bch_io_opts opts)
-{
- op->c = c;
- op->end_io = NULL;
- op->flags = 0;
- op->written = 0;
- op->error = 0;
- op->csum_type = bch2_data_checksum_type(c, opts);
- op->compression_opt = opts.compression;
- op->nr_replicas = 0;
- op->nr_replicas_required = c->opts.data_replicas_required;
- op->watermark = BCH_WATERMARK_normal;
- op->incompressible = 0;
- op->open_buckets.nr = 0;
- op->devs_have.nr = 0;
- op->target = 0;
- op->opts = opts;
- op->subvol = 0;
- op->pos = POS_MAX;
- op->version = ZERO_VERSION;
- op->write_point = (struct write_point_specifier) { 0 };
- op->res = (struct disk_reservation) { 0 };
- op->new_i_size = U64_MAX;
- op->i_sectors_delta = 0;
- op->devs_need_flush = NULL;
-}
-
-CLOSURE_CALLBACK(bch2_write);
-void bch2_write_point_do_index_updates(struct work_struct *);
-
-static inline struct bch_write_bio *wbio_init(struct bio *bio)
-{
- struct bch_write_bio *wbio = to_wbio(bio);
-
- memset(&wbio->wbio, 0, sizeof(wbio->wbio));
- return wbio;
-}
-
-void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
-
-void bch2_fs_io_write_exit(struct bch_fs *);
-int bch2_fs_io_write_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_WRITE_H */
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
deleted file mode 100644
index 5da4eb8bb6f6..000000000000
--- a/fs/bcachefs/io_write_types.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_WRITE_TYPES_H
-#define _BCACHEFS_IO_WRITE_TYPES_H
-
-#include "alloc_types.h"
-#include "btree_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-#include "keylist_types.h"
-#include "opts.h"
-#include "super_types.h"
-
-#include <linux/llist.h>
-#include <linux/workqueue.h>
-
-#define BCH_WRITE_FLAGS() \
- x(alloc_nowait) \
- x(cached) \
- x(data_encoded) \
- x(pages_stable) \
- x(pages_owned) \
- x(only_specified_devs) \
- x(wrote_data_inline) \
- x(check_enospc) \
- x(sync) \
- x(move) \
- x(in_worker) \
- x(submitted) \
- x(io_error) \
- x(convert_unwritten)
-
-enum __bch_write_flags {
-#define x(f) __BCH_WRITE_##f,
- BCH_WRITE_FLAGS()
-#undef x
-};
-
-enum bch_write_flags {
-#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
- BCH_WRITE_FLAGS()
-#undef x
-};
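
Annotation: BCH_WRITE_FLAGS() above is an x-macro: one list of flag names expands into a bit-index enum, a bitmask enum, and (earlier in io_write.c) a string table, all of which stay in sync automatically. A standalone sketch of the pattern with made-up flag names:

#include <stdio.h>

#define MY_FLAGS()	\
	x(cached)	\
	x(sync)		\
	x(move)

enum __my_flags {
#define x(f) __MY_FLAG_##f,
	MY_FLAGS()
#undef x
	__MY_FLAG_NR,
};

enum my_flags {
#define x(f) MY_FLAG_##f = 1U << __MY_FLAG_##f,
	MY_FLAGS()
#undef x
};

static const char * const my_flag_names[] = {
#define x(f) #f,
	MY_FLAGS()
#undef x
	NULL
};

int main(void)
{
	unsigned flags = MY_FLAG_sync | MY_FLAG_move;

	for (unsigned i = 0; i < __MY_FLAG_NR; i++)
		if (flags & (1U << i))
			printf("%s ", my_flag_names[i]);
	printf("\n");
	return 0;
}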
-
-struct bch_write_bio {
- struct_group(wbio,
- struct bch_fs *c;
- struct bch_write_bio *parent;
-
- u64 submit_time;
- u64 inode_offset;
- u64 nocow_bucket;
-
- struct bch_devs_list failed;
- u8 dev;
-
- unsigned split:1,
- bounce:1,
- put_bio:1,
- have_ioref:1,
- nocow:1,
- used_mempool:1,
- first_btree_write:1;
- );
-
- struct bio bio;
-};
-
-struct bch_write_op {
- struct closure cl;
- struct bch_fs *c;
- void (*end_io)(struct bch_write_op *);
- u64 start_time;
-
-#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
- unsigned list_idx;
-#endif
-
- unsigned written; /* sectors */
- u16 flags;
- s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
-
- unsigned compression_opt:8;
- unsigned csum_type:4;
- unsigned nr_replicas:4;
- unsigned nr_replicas_required:4;
- unsigned watermark:3;
- unsigned incompressible:1;
- unsigned stripe_waited:1;
-
- struct bch_devs_list devs_have;
- u16 target;
- u16 nonce;
- struct bch_io_opts opts;
-
- u32 subvol;
- struct bpos pos;
- struct bversion version;
-
- /* For BCH_WRITE_data_encoded: */
- struct bch_extent_crc_unpacked crc;
-
- struct write_point_specifier write_point;
-
- struct write_point *wp;
- struct list_head wp_list;
-
- struct disk_reservation res;
-
- struct open_buckets open_buckets;
-
- u64 new_i_size;
- s64 i_sectors_delta;
-
- struct bch_devs_mask failed;
-
- struct keylist insert_keys;
- u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
-
- /*
- * Bitmask of devices that have had nocow writes issued to them since
- * last flush:
- */
- struct bch_devs_mask *devs_need_flush;
-
- /* Must be last: */
- struct bch_write_bio wbio;
-};
-
-#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
deleted file mode 100644
index ddfeb0dafc9d..000000000000
--- a/fs/bcachefs/journal.c
+++ /dev/null
@@ -1,1832 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcachefs journalling code, for btree insertions
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_foreground.h"
-#include "bkey_methods.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_sb.h"
-#include "journal_seq_blacklist.h"
-#include "trace.h"
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
- return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
- return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
- return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
- return __journal_entry_is_open(j->reservations);
-}
-
-static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
-{
- union journal_res_state s = READ_ONCE(j->reservations);
- unsigned i = seq & JOURNAL_BUF_MASK;
- struct journal_buf *buf = j->buf + i;
-
- prt_printf(out, "seq:\t%llu\n", seq);
- printbuf_indent_add(out, 2);
-
- if (!buf->write_started)
- prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK));
-
- struct closure *cl = &buf->io;
- int r = atomic_read(&cl->remaining);
- prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK);
-
- if (buf->data) {
- prt_printf(out, "size:\t");
- prt_human_readable_u64(out, vstruct_bytes(buf->data));
- prt_newline(out);
- }
-
- prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies);
-
- prt_printf(out, "flags:\t");
- if (buf->noflush)
- prt_str(out, "noflush ");
- if (buf->must_flush)
- prt_str(out, "must_flush ");
- if (buf->separate_flush)
- prt_str(out, "separate_flush ");
- if (buf->need_flush_to_write_buffer)
- prt_str(out, "need_flush_to_write_buffer ");
- if (buf->write_started)
- prt_str(out, "write_started ");
- if (buf->write_allocated)
- prt_str(out, "write_allocated ");
- if (buf->write_done)
- prt_str(out, "write_done");
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
-}
-
-static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
-{
- lockdep_assert_held(&j->lock);
- out->atomic++;
-
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 24);
-
- for (u64 seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++)
- bch2_journal_buf_to_text(out, j, seq);
- prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
-
- --out->atomic;
-}
-
-static inline struct journal_buf *
-journal_seq_to_buf(struct journal *j, u64 seq)
-{
- struct journal_buf *buf = NULL;
-
- EBUG_ON(seq > journal_cur_seq(j));
-
- if (journal_seq_unwritten(j, seq))
- buf = j->buf + (seq & JOURNAL_BUF_MASK);
- return buf;
-}
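
Annotation: journal_seq_to_buf() above maps a sequence number onto a small power-of-two ring of journal buffers with seq & JOURNAL_BUF_MASK, so consecutive sequences reuse slots in order. A tiny standalone sketch of that masked ring indexing (assumed names and sizes):

#include <stdio.h>

#define NR_BUFS		4u		/* must be a power of two */
#define BUF_MASK	(NR_BUFS - 1)

struct buf { unsigned long long seq; };

int main(void)
{
	struct buf bufs[NR_BUFS];

	for (unsigned long long seq = 10; seq < 16; seq++) {
		struct buf *b = &bufs[seq & BUF_MASK];

		b->seq = seq;
		printf("seq %llu -> slot %llu\n", seq, seq & BUF_MASK);
	}
	return 0;
}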
-
-static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
- INIT_LIST_HEAD(&p->unflushed[i]);
- for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
- INIT_LIST_HEAD(&p->flushed[i]);
- atomic_set(&p->count, count);
- p->devs.nr = 0;
-}
-
-/*
- * Detect stuck journal conditions and trigger shutdown. Technically the journal
- * can end up stuck for a variety of reasons, such as a blocked I/O, journal
- * reservation lockup, etc. Since this is a fatal error with potentially
- * unpredictable characteristics, we want to be fairly conservative before we
- * decide to shut things down.
- *
- * Consider the journal stuck when it appears full with no ability to commit
- * btree transactions, to discard journal buckets, nor acquire priority
- * (reserved watermark) reservation.
- */
-static inline bool
-journal_error_check_stuck(struct journal *j, int error, unsigned flags)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool stuck = false;
- struct printbuf buf = PRINTBUF;
-
- buf.atomic++;
-
- if (!(error == -BCH_ERR_journal_full ||
- error == -BCH_ERR_journal_pin_full) ||
- nr_unwritten_journal_entries(j) ||
- (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
- return stuck;
-
- spin_lock(&j->lock);
-
- if (j->can_discard) {
- spin_unlock(&j->lock);
- return stuck;
- }
-
- stuck = true;
-
- /*
- * The journal shutdown path will set ->err_seq, but do it here first to
- * serialize against concurrent failures and avoid duplicate error
- * reports.
- */
- if (j->err_seq) {
- spin_unlock(&j->lock);
- return stuck;
- }
- j->err_seq = journal_cur_seq(j);
-
- __bch2_journal_debug_to_text(&buf, j);
- spin_unlock(&j->lock);
- prt_printf(&buf, bch2_fmt(c, "Journal stuck! Have a pre-reservation but journal full (error %s)"),
- bch2_err_str(error));
- bch2_print_str(c, KERN_ERR, buf.buf);
-
- printbuf_reset(&buf);
- bch2_journal_pins_to_text(&buf, j);
- bch_err(c, "Journal pins:\n%s", buf.buf);
- printbuf_exit(&buf);
-
- bch2_fatal_error(c);
- dump_stack();
-
- return stuck;
-}
-
-void bch2_journal_do_writes(struct journal *j)
-{
- for (u64 seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++) {
- unsigned idx = seq & JOURNAL_BUF_MASK;
- struct journal_buf *w = j->buf + idx;
-
- if (w->write_started && !w->write_allocated)
- break;
- if (w->write_started)
- continue;
-
- if (!journal_state_seq_count(j, j->reservations, seq)) {
- j->seq_write_started = seq;
- w->write_started = true;
- closure_call(&w->io, bch2_journal_write, j->wq, NULL);
- }
-
- break;
- }
-}
-
-/*
- * Final processing when the last reference of a journal buffer has been
- * dropped. Drop the pin list reference acquired at journal entry open and write
- * the buffer, if requested.
- */
-void bch2_journal_buf_put_final(struct journal *j, u64 seq)
-{
- lockdep_assert_held(&j->lock);
-
- if (__bch2_journal_pin_put(j, seq))
- bch2_journal_reclaim_fast(j);
- bch2_journal_do_writes(j);
-
- /*
- * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an
- * open journal entry
- */
- wake_up(&j->wait);
-}
-
-/*
- * Returns true if journal entry is now closed:
- *
- * We don't close a journal_buf until the next journal_buf is finished writing,
- * and can be opened again - this also initializes the next journal_buf:
- */
-static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf = journal_cur_buf(j);
- union journal_res_state old, new;
- unsigned sectors;
-
- BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
- closed_val != JOURNAL_ENTRY_ERROR_VAL);
-
- lockdep_assert_held(&j->lock);
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
- new.cur_entry_offset = closed_val;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
- old.cur_entry_offset == new.cur_entry_offset)
- return;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
- if (!__journal_entry_is_open(old))
- return;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)
- old.cur_entry_offset = j->cur_entry_offset_if_blocked;
-
- /* Close out old buffer: */
- buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
-
- if (trace_journal_entry_close_enabled() && trace) {
- struct printbuf pbuf = PRINTBUF;
- pbuf.atomic++;
-
- prt_str(&pbuf, "entry size: ");
- prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
- prt_newline(&pbuf);
- bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
- trace_journal_entry_close(c, pbuf.buf);
- printbuf_exit(&pbuf);
- }
-
- sectors = vstruct_blocks_plus(buf->data, c->block_bits,
- buf->u64s_reserved) << c->block_bits;
- if (unlikely(sectors > buf->sectors)) {
- struct printbuf err = PRINTBUF;
- err.atomic++;
-
- prt_printf(&err, "journal entry overran reserved space: %u > %u\n",
- sectors, buf->sectors);
- prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n",
- le32_to_cpu(buf->data->u64s), buf->u64s_reserved,
- j->cur_entry_u64s,
- c->block_bits);
- prt_printf(&err, "fatal error - emergency read only");
- bch2_journal_halt_locked(j);
-
- bch_err(c, "%s", err.buf);
- printbuf_exit(&err);
- return;
- }
-
- buf->sectors = sectors;
-
- /*
- * We have to set last_seq here, _before_ opening a new journal entry:
- *
- * A thread may replace an old pin with a new pin on its current
- * journal reservation - the expectation being that the journal will
- * contain either what the old pin protected or what the new pin
- * protects.
- *
- * After the old pin is dropped journal_last_seq() won't include the old
- * pin, so we can only write the updated last_seq on the entry that
- * contains whatever the new pin protects.
- *
- * Restated, we can _not_ update last_seq for a given entry if there
- * could be a newer entry open with reservations/pins that have been
- * taken against it.
- *
- * Hence, we want to update/set last_seq on the current journal entry right
- * before we open a new one:
- */
- buf->last_seq = journal_last_seq(j);
- buf->data->last_seq = cpu_to_le64(buf->last_seq);
- BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
-
- cancel_delayed_work(&j->write_work);
-
- bch2_journal_space_available(j);
-
- __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq));
-}
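
Annotation: the journal's reservation state is packed into a single 64-bit word and updated with an atomic64_try_cmpxchg() retry loop, as in the close path above. A standalone sketch of that compare-and-swap loop using C11 atomics; the field layout and names here are invented for illustration and do not match the real journal_res_state:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define OFFSET_MASK	((1ull << 20) - 1)

static _Atomic uint64_t state;

/* set the low 20 bits (the "current entry offset") to a closed marker */
static void close_entry(uint64_t closed_val)
{
	uint64_t old = atomic_load(&state), new;

	do {
		if ((old & OFFSET_MASK) == closed_val)
			return;		/* someone else already closed it */
		new = (old & ~OFFSET_MASK) | closed_val;
	} while (!atomic_compare_exchange_weak(&state, &old, new));
}

int main(void)
{
	atomic_store(&state, (7ull << 20) | 123);	/* idx 7, offset 123 */
	close_entry(OFFSET_MASK);			/* "closed" sentinel */
	printf("state: %#llx\n", (unsigned long long)atomic_load(&state));
	return 0;
}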
-
-void bch2_journal_halt_locked(struct journal *j)
-{
- lockdep_assert_held(&j->lock);
-
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
- if (!j->err_seq)
- j->err_seq = journal_cur_seq(j);
- journal_wake(j);
-}
-
-void bch2_journal_halt(struct journal *j)
-{
- spin_lock(&j->lock);
- bch2_journal_halt_locked(j);
- spin_unlock(&j->lock);
-}
-
-static bool journal_entry_want_write(struct journal *j)
-{
- bool ret = !journal_entry_is_open(j) ||
- journal_cur_seq(j) == journal_last_unwritten_seq(j);
-
- /* Don't close it yet if we already have a write in flight: */
- if (ret)
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
- else if (nr_unwritten_journal_entries(j)) {
- struct journal_buf *buf = journal_cur_buf(j);
-
- if (!buf->flush_time) {
- buf->flush_time = local_clock() ?: 1;
- buf->expires = jiffies;
- }
- }
-
- return ret;
-}
-
-bool bch2_journal_entry_close(struct journal *j)
-{
- bool ret;
-
- spin_lock(&j->lock);
- ret = journal_entry_want_write(j);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/*
- * should _only_ be called from journal_res_get() - when we actually want a
- * journal reservation - journal entry is open means journal is dirty:
- */
-static int journal_entry_open(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf = j->buf +
- ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
- union journal_res_state old, new;
- int u64s;
-
- lockdep_assert_held(&j->lock);
- BUG_ON(journal_entry_is_open(j));
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
- if (j->blocked)
- return bch_err_throw(c, journal_blocked);
-
- if (j->cur_entry_error)
- return j->cur_entry_error;
-
- int ret = bch2_journal_error(j);
- if (unlikely(ret))
- return ret;
-
- if (!fifo_free(&j->pin))
- return bch_err_throw(c, journal_pin_full);
-
- if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
- return bch_err_throw(c, journal_max_in_flight);
-
- if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
- return bch_err_throw(c, journal_max_open);
-
- if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) {
- bch_err(c, "cannot start: journal seq overflow");
- if (bch2_fs_emergency_read_only_locked(c))
- bch_err(c, "fatal error - emergency read only");
- return bch_err_throw(c, journal_shutdown);
- }
-
- if (!j->free_buf && !buf->data)
- return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */
-
- BUG_ON(!j->cur_entry_sectors);
-
- if (!buf->data) {
- swap(buf->data, j->free_buf);
- swap(buf->buf_size, j->free_buf_size);
- }
-
- buf->expires =
- (journal_cur_seq(j) == j->flushed_seq_ondisk
- ? jiffies
- : j->last_flush_write) +
- msecs_to_jiffies(c->opts.journal_flush_delay);
-
- buf->u64s_reserved = j->entry_u64s_reserved;
- buf->disk_sectors = j->cur_entry_sectors;
- buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
-
- u64s = (int) (buf->sectors << 9) / sizeof(u64) -
- journal_entry_overhead(j);
- u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
-
- if (u64s <= (ssize_t) j->early_journal_entries.nr)
- return bch_err_throw(c, journal_full);
-
- if (fifo_empty(&j->pin) && j->reclaim_thread)
- wake_up_process(j->reclaim_thread);
-
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for journal_last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
- journal_pin_list_init(fifo_push_ref(&j->pin), 1);
-
- if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) {
- bch_err(c, "attempting to open blacklisted journal seq %llu",
- journal_cur_seq(j));
- if (bch2_fs_emergency_read_only_locked(c))
- bch_err(c, "fatal error - emergency read only");
- return bch_err_throw(c, journal_shutdown);
- }
-
- BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
-
- BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
-
- bkey_extent_init(&buf->key);
- buf->noflush = false;
- buf->must_flush = false;
- buf->separate_flush = false;
- buf->flush_time = 0;
- buf->need_flush_to_write_buffer = true;
- buf->write_started = false;
- buf->write_allocated = false;
- buf->write_done = false;
-
- memset(buf->data, 0, sizeof(*buf->data));
- buf->data->seq = cpu_to_le64(journal_cur_seq(j));
- buf->data->u64s = 0;
-
- if (j->early_journal_entries.nr) {
- memcpy(buf->data->_data, j->early_journal_entries.data,
- j->early_journal_entries.nr * sizeof(u64));
- le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr);
- }
-
- /*
- * Must be set before marking the journal entry as open:
- */
- j->cur_entry_u64s = u64s;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
-
- BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
-
- new.idx++;
- BUG_ON(journal_state_count(new, new.idx));
- BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK));
-
- journal_state_inc(&new);
-
- /* Handle any already added entries */
- new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
- if (nr_unwritten_journal_entries(j) == 1)
- mod_delayed_work(j->wq,
- &j->write_work,
- msecs_to_jiffies(c->opts.journal_flush_delay));
- journal_wake(j);
-
- if (j->early_journal_entries.nr)
- darray_exit(&j->early_journal_entries);
- return 0;
-}
-
-static bool journal_quiesced(struct journal *j)
-{
- bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
-
- if (!ret)
- bch2_journal_entry_close(j);
- return ret;
-}
-
-static void journal_quiesce(struct journal *j)
-{
- wait_event(j->wait, journal_quiesced(j));
-}
-
-static void journal_write_work(struct work_struct *work)
-{
- struct journal *j = container_of(work, struct journal, write_work.work);
-
- spin_lock(&j->lock);
- if (__journal_entry_is_open(j->reservations)) {
- long delta = journal_cur_buf(j)->expires - jiffies;
-
- if (delta > 0)
- mod_delayed_work(j->wq, &j->write_work, delta);
- else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
- }
- spin_unlock(&j->lock);
-}
-
-static void journal_buf_prealloc(struct journal *j)
-{
- if (j->free_buf &&
- j->free_buf_size >= j->buf_size_want)
- return;
-
- unsigned buf_size = j->buf_size_want;
-
- spin_unlock(&j->lock);
- void *buf = kvmalloc(buf_size, GFP_NOFS);
- spin_lock(&j->lock);
-
- if (buf &&
- (!j->free_buf ||
- buf_size > j->free_buf_size)) {
- swap(buf, j->free_buf);
- swap(buf_size, j->free_buf_size);
- }
-
- if (unlikely(buf)) {
- spin_unlock(&j->lock);
- /* kvfree can sleep */
- kvfree(buf);
- spin_lock(&j->lock);
- }
-}
-
-static int __journal_res_get(struct journal *j, struct journal_res *res,
- unsigned flags)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf;
- bool can_discard;
- int ret;
-retry:
- if (journal_res_get_fast(j, res, flags))
- return 0;
-
- ret = bch2_journal_error(j);
- if (unlikely(ret))
- return ret;
-
- if (j->blocked)
- return bch_err_throw(c, journal_blocked);
-
- if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- ret = bch_err_throw(c, journal_full);
- can_discard = j->can_discard;
- goto out;
- }
-
- if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
- ret = bch_err_throw(c, journal_max_in_flight);
- goto out;
- }
-
- spin_lock(&j->lock);
-
- journal_buf_prealloc(j);
-
- /*
- * Recheck after taking the lock, so we don't race with another thread
- * that just did journal_entry_open(), and don't call bch2_journal_entry_close()
- * unnecessarily
- */
- if (journal_res_get_fast(j, res, flags)) {
- ret = 0;
- goto unlock;
- }
-
- /*
- * If we couldn't get a reservation because the current buf filled up,
- * and we had room for a bigger entry on disk, signal that we want to
- * realloc the journal bufs:
- */
- buf = journal_cur_buf(j);
- if (journal_entry_is_open(j) &&
- buf->buf_size >> 9 < buf->disk_sectors &&
- buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
- j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
-
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
- ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open;
-unlock:
- can_discard = j->can_discard;
- spin_unlock(&j->lock);
-out:
- if (likely(!ret))
- return 0;
- if (ret == -BCH_ERR_journal_retry_open)
- goto retry;
-
- if (journal_error_check_stuck(j, ret, flags))
- ret = bch_err_throw(c, journal_stuck);
-
- if (ret == -BCH_ERR_journal_max_in_flight &&
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
- trace_journal_entry_full_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_printbuf_make_room(&buf, 4096);
-
- spin_lock(&j->lock);
- prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
- bch2_journal_bufs_to_text(&buf, j);
- spin_unlock(&j->lock);
-
- trace_journal_entry_full(c, buf.buf);
- printbuf_exit(&buf);
- count_event(c, journal_entry_full);
- }
-
- if (ret == -BCH_ERR_journal_max_open &&
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) &&
- trace_journal_entry_full_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_printbuf_make_room(&buf, 4096);
-
- spin_lock(&j->lock);
- prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
- bch2_journal_bufs_to_text(&buf, j);
- spin_unlock(&j->lock);
-
- trace_journal_entry_full(c, buf.buf);
- printbuf_exit(&buf);
- count_event(c, journal_entry_full);
- }
-
- /*
- * Journal is full - can't rely on reclaim from work item due to
- * freezing:
- */
- if ((ret == -BCH_ERR_journal_full ||
- ret == -BCH_ERR_journal_pin_full) &&
- !(flags & JOURNAL_RES_GET_NONBLOCK)) {
- if (can_discard) {
- bch2_journal_do_discards(j);
- goto retry;
- }
-
- if (mutex_trylock(&j->reclaim_lock)) {
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
- }
- }
-
- return ret;
-}
-
-static unsigned max_dev_latency(struct bch_fs *c)
-{
- u64 nsecs = 0;
-
- guard(rcu)();
- for_each_rw_member_rcu(c, ca)
- nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
-
- return nsecs_to_jiffies(nsecs);
-}
-
-/*
- * Essentially the entry function to the journaling code. When bcachefs is doing
- * a btree insert, it calls this function to get the current journal write.
- * Journal write is the structure used to set up journal writes. The calling
- * function will then add its keys to the structure, queuing them for the next
- * write.
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks.
- */
-int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
- unsigned flags,
- struct btree_trans *trans)
-{
- int ret;
-
- if (closure_wait_event_timeout(&j->async_wait,
- !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
- (flags & JOURNAL_RES_GET_NONBLOCK),
- HZ))
- return ret;
-
- if (trans)
- bch2_trans_unlock_long(trans);
-
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10);
-
- remaining_wait = max(0, remaining_wait - HZ);
-
- if (closure_wait_event_timeout(&j->async_wait,
- !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
- (flags & JOURNAL_RES_GET_NONBLOCK),
- remaining_wait))
- return ret;
-
-	struct printbuf buf = PRINTBUF;
-	prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
-	bch2_journal_debug_to_text(&buf, j);
-	bch2_print_str(c, KERN_ERR, buf.buf);
-	printbuf_exit(&buf);
-
- closure_wait_event(&j->async_wait,
- !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
- (flags & JOURNAL_RES_GET_NONBLOCK));
- return ret;
-}
-
-/* journal_entry_res: */
-
-void bch2_journal_entry_res_resize(struct journal *j,
- struct journal_entry_res *res,
- unsigned new_u64s)
-{
- union journal_res_state state;
- int d = new_u64s - res->u64s;
-
- spin_lock(&j->lock);
-
- j->entry_u64s_reserved += d;
- if (d <= 0)
- goto out;
-
- j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
- state = READ_ONCE(j->reservations);
-
- if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
- state.cur_entry_offset > j->cur_entry_u64s) {
- j->cur_entry_u64s += d;
- /*
- * Not enough room in current journal entry, have to flush it:
- */
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
- } else {
- journal_cur_buf(j)->u64s_reserved += d;
- }
-out:
- spin_unlock(&j->lock);
- res->u64s += d;
-}
-
-/* journal flushing: */
-
-/**
- * bch2_journal_flush_seq_async - wait for a journal entry to be written
- * @j: journal object
- * @seq: seq to flush
- * @parent: closure object to wait with
- * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed,
- * -BCH_ERR_journal_flush_err if @seq will never be flushed
- *
- * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
- * necessary
- */
-int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
- struct closure *parent)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf;
- int ret = 0;
-
- if (seq <= j->flushed_seq_ondisk)
- return 1;
-
- spin_lock(&j->lock);
-
- if (WARN_ONCE(seq > journal_cur_seq(j),
- "requested to flush journal seq %llu, but currently at %llu",
- seq, journal_cur_seq(j)))
- goto out;
-
- /* Recheck under lock: */
- if (j->err_seq && seq >= j->err_seq) {
- ret = bch_err_throw(c, journal_flush_err);
- goto out;
- }
-
- if (seq <= j->flushed_seq_ondisk) {
- ret = 1;
- goto out;
- }
-
- /* if seq was written, but not flushed - flush a newer one instead */
- seq = max(seq, journal_last_unwritten_seq(j));
-
-recheck_need_open:
- if (seq > journal_cur_seq(j)) {
- struct journal_res res = { 0 };
-
- if (journal_entry_is_open(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-
- spin_unlock(&j->lock);
-
- /*
- * We're called from bch2_journal_flush_seq() -> wait_event();
- * but this might block. We won't usually block, so we won't
- * livelock:
- */
- sched_annotate_sleep();
- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
- if (ret)
- return ret;
-
- seq = res.seq;
- buf = journal_seq_to_buf(j, seq);
- buf->must_flush = true;
-
- if (!buf->flush_time) {
- buf->flush_time = local_clock() ?: 1;
- buf->expires = jiffies;
- }
-
- if (parent && !closure_wait(&buf->wait, parent))
- BUG();
-
- bch2_journal_res_put(j, &res);
-
- spin_lock(&j->lock);
- goto want_write;
- }
-
- /*
- * if write was kicked off without a flush, or if we promised it
- * wouldn't be a flush, flush the next sequence number instead
- */
- buf = journal_seq_to_buf(j, seq);
- if (buf->noflush) {
- seq++;
- goto recheck_need_open;
- }
-
- buf->must_flush = true;
- j->flushing_seq = max(j->flushing_seq, seq);
-
- if (parent && !closure_wait(&buf->wait, parent))
- BUG();
-want_write:
- if (seq == journal_cur_seq(j))
- journal_entry_want_write(j);
-out:
- spin_unlock(&j->lock);
- return ret;
-}
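/*
 * Editor's illustration (not part of the original file): one way a caller
 * might use bch2_journal_flush_seq_async() with a stack closure to wait for a
 * specific sequence number. example_flush_and_wait() is a hypothetical name,
 * and closure_sync() from <linux/closure.h> is assumed to return immediately
 * when no waiter was registered.
 */
static int example_flush_and_wait(struct journal *j, u64 seq)
{
	struct closure cl;

	closure_init_stack(&cl);

	int ret = bch2_journal_flush_seq_async(j, seq, &cl);
	/*
	 * ret == 1: @seq already flushed, nothing to wait for
	 * ret == 0: a flush was started; closure_sync() waits for the write
	 * ret <  0: @seq will never be flushed (journal error)
	 */
	closure_sync(&cl);

	return ret < 0 ? ret : 0;
}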
-
-int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state)
-{
- u64 start_time = local_clock();
- int ret, ret2;
-
- /*
- * Don't update time_stats when @seq is already flushed:
- */
- if (seq <= j->flushed_seq_ondisk)
- return 0;
-
- ret = wait_event_state(j->wait,
- (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)),
- task_state);
-
- if (!ret)
- bch2_time_stats_update(j->flush_seq_time, start_time);
-
- return ret ?: ret2 < 0 ? ret2 : 0;
-}
-
-/*
- * bch2_journal_flush_async - if there is an open journal entry, or a journal
- * entry still being written, write it and wait for the write to complete
- */
-void bch2_journal_flush_async(struct journal *j, struct closure *parent)
-{
- bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
-}
-
-int bch2_journal_flush(struct journal *j)
-{
- return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE);
-}
-
-/*
- * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the
- * range [start, end)
- * @j: journal object
- * @start: start of the range
- * @end: end of the range, exclusive
- */
-bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- u64 unwritten_seq;
- bool ret = false;
-
- if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
- return false;
-
- if (c->journal.flushed_seq_ondisk >= start)
- return false;
-
- spin_lock(&j->lock);
- if (c->journal.flushed_seq_ondisk >= start)
- goto out;
-
- for (unwritten_seq = journal_last_unwritten_seq(j);
- unwritten_seq < end;
- unwritten_seq++) {
- struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
-
-		/* journal flush already in flight, or flush requested */
- if (buf->must_flush)
- goto out;
-
- buf->noflush = true;
- }
-
- ret = true;
-out:
- spin_unlock(&j->lock);
- return ret;
-}
-
-static int __bch2_journal_meta(struct journal *j)
-{
- struct journal_res res = {};
- int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
- if (ret)
- return ret;
-
- struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
- buf->must_flush = true;
-
- if (!buf->flush_time) {
- buf->flush_time = local_clock() ?: 1;
- buf->expires = jiffies;
- }
-
- bch2_journal_res_put(j, &res);
-
- return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE);
-}
-
-int bch2_journal_meta(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal))
- return bch_err_throw(c, erofs_no_writes);
-
- int ret = __bch2_journal_meta(j);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal);
- return ret;
-}
-
-/* block/unlock the journal: */
-
-void bch2_journal_unblock(struct journal *j)
-{
- spin_lock(&j->lock);
- if (!--j->blocked &&
- j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL &&
- j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) {
- union journal_res_state old, new;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
- new.cur_entry_offset = j->cur_entry_offset_if_blocked;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
- }
- spin_unlock(&j->lock);
-
- journal_wake(j);
-}
-
-static void __bch2_journal_block(struct journal *j)
-{
- if (!j->blocked++) {
- union journal_res_state old, new;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- j->cur_entry_offset_if_blocked = old.cur_entry_offset;
-
- if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL)
- break;
-
- new.v = old.v;
- new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
-
- if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL)
- journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
- }
-}
-
-void bch2_journal_block(struct journal *j)
-{
- spin_lock(&j->lock);
- __bch2_journal_block(j);
- spin_unlock(&j->lock);
-
- journal_quiesce(j);
-}
-
-static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j,
- u64 max_seq, bool *blocked)
-{
- struct journal_buf *ret = NULL;
-
-	/* We're inside wait_event(), but using mutex_lock(): */
- sched_annotate_sleep();
- mutex_lock(&j->buf_lock);
- spin_lock(&j->lock);
- max_seq = min(max_seq, journal_cur_seq(j));
-
- for (u64 seq = journal_last_unwritten_seq(j);
- seq <= max_seq;
- seq++) {
- unsigned idx = seq & JOURNAL_BUF_MASK;
- struct journal_buf *buf = j->buf + idx;
-
- if (buf->need_flush_to_write_buffer) {
- union journal_res_state s;
- s.v = atomic64_read_acquire(&j->reservations.counter);
-
- unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s);
-
- if (open && !*blocked) {
- __bch2_journal_block(j);
- s.v = atomic64_read_acquire(&j->reservations.counter);
- *blocked = true;
- }
-
- ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
- ? ERR_PTR(-EAGAIN)
- : buf;
- break;
- }
- }
-
- spin_unlock(&j->lock);
- if (IS_ERR_OR_NULL(ret))
- mutex_unlock(&j->buf_lock);
- return ret;
-}
-
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
- u64 max_seq, bool *blocked)
-{
- struct journal_buf *ret;
- *blocked = false;
-
- wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j,
- max_seq, blocked)) != ERR_PTR(-EAGAIN));
- if (IS_ERR_OR_NULL(ret) && *blocked)
- bch2_journal_unblock(j);
-
- return ret;
-}
-
-/* allocate journal on a device: */
-
-static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
- bool new_fs, struct closure *cl)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- u64 *new_bucket_seq = NULL, *new_buckets = NULL;
- struct open_bucket **ob = NULL;
- long *bu = NULL;
- unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
- int ret = 0;
-
- BUG_ON(nr <= ja->nr);
-
- bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
- ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
- new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
- new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
- if (!bu || !ob || !new_buckets || !new_bucket_seq) {
- ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
- goto err_free;
- }
-
- for (nr_got = 0; nr_got < nr_want; nr_got++) {
- enum bch_watermark watermark = new_fs
- ? BCH_WATERMARK_btree
- : BCH_WATERMARK_normal;
-
- ob[nr_got] = bch2_bucket_alloc(c, ca, watermark,
- BCH_DATA_journal, cl);
- ret = PTR_ERR_OR_ZERO(ob[nr_got]);
- if (ret)
- break;
-
- if (!new_fs) {
- ret = bch2_trans_run(c,
- bch2_trans_mark_metadata_bucket(trans, ca,
- ob[nr_got]->bucket, BCH_DATA_journal,
- ca->mi.bucket_size, BTREE_TRIGGER_transactional));
- if (ret) {
- bch2_open_bucket_put(c, ob[nr_got]);
- bch_err_msg(c, ret, "marking new journal buckets");
- break;
- }
- }
-
- bu[nr_got] = ob[nr_got]->bucket;
- }
-
- if (!nr_got)
- goto err_free;
-
- /* Don't return an error if we successfully allocated some buckets: */
- ret = 0;
-
- if (c) {
- bch2_journal_flush_all_pins(&c->journal);
- bch2_journal_block(&c->journal);
- mutex_lock(&c->sb_lock);
- }
-
- memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
- memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
-
- BUG_ON(ja->discard_idx > ja->nr);
-
- pos = ja->discard_idx ?: ja->nr;
-
- memmove(new_buckets + pos + nr_got,
- new_buckets + pos,
- sizeof(new_buckets[0]) * (ja->nr - pos));
- memmove(new_bucket_seq + pos + nr_got,
- new_bucket_seq + pos,
- sizeof(new_bucket_seq[0]) * (ja->nr - pos));
-
- for (i = 0; i < nr_got; i++) {
- new_buckets[pos + i] = bu[i];
- new_bucket_seq[pos + i] = 0;
- }
-
- nr = ja->nr + nr_got;
-
- ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
- if (ret)
- goto err_unblock;
-
- bch2_write_super(c);
-
- /* Commit: */
- if (c)
- spin_lock(&c->journal.lock);
-
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
- ja->nr = nr;
-
- if (pos <= ja->discard_idx)
- ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
- if (pos <= ja->dirty_idx_ondisk)
- ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
- if (pos <= ja->dirty_idx)
- ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
- if (pos <= ja->cur_idx)
- ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
-
- if (c)
- spin_unlock(&c->journal.lock);
-err_unblock:
- if (c) {
- bch2_journal_unblock(&c->journal);
- mutex_unlock(&c->sb_lock);
- }
-
- if (ret && !new_fs)
- for (i = 0; i < nr_got; i++)
- bch2_trans_run(c,
- bch2_trans_mark_metadata_bucket(trans, ca,
- bu[i], BCH_DATA_free, 0,
- BTREE_TRIGGER_transactional));
-err_free:
- for (i = 0; i < nr_got; i++)
- bch2_open_bucket_put(c, ob[i]);
-
- kfree(new_bucket_seq);
- kfree(new_buckets);
- kfree(ob);
- kfree(bu);
- return ret;
-}
-
-static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr, bool new_fs)
-{
- struct journal_device *ja = &ca->journal;
- int ret = 0;
-
- struct closure cl;
- closure_init_stack(&cl);
-
- /* don't handle reducing nr of buckets yet: */
- if (nr < ja->nr)
- return 0;
-
- while (!ret && ja->nr < nr) {
- struct disk_reservation disk_res = { 0, 0, 0 };
-
- /*
- * note: journal buckets aren't really counted as _sectors_ used yet, so
- * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
- * when space used goes up without a reservation - but we do need the
- * reservation to ensure we'll actually be able to allocate:
- *
- * XXX: that's not right, disk reservations only ensure a
- * filesystem-wide allocation will succeed, this is a device
- * specific allocation - we can hang here:
- */
- if (!new_fs) {
- ret = bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0);
- if (ret)
- break;
- }
-
- ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);
-
- if (ret == -BCH_ERR_bucket_alloc_blocked ||
- ret == -BCH_ERR_open_buckets_empty)
- ret = 0; /* wait and retry */
-
- bch2_disk_reservation_put(c, &disk_res);
- bch2_wait_on_allocator(c, &cl);
- }
-
- return ret;
-}
-
-/*
- * Allocate more journal space at runtime - not currently making use of it, but
- * the code works:
- */
-int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
-{
- down_write(&c->state_lock);
- int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false);
- up_write(&c->state_lock);
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b)
-{
- struct bch_fs *c = ca->fs;
- struct journal *j = &c->journal;
- struct journal_device *ja = &ca->journal;
-
- guard(mutex)(&c->sb_lock);
- unsigned pos;
- for (pos = 0; pos < ja->nr; pos++)
- if (ja->buckets[pos] == b)
- break;
-
- if (pos == ja->nr) {
- bch_err(ca, "journal bucket %llu not found when deleting", b);
- return -EINVAL;
- }
-
-	u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
- if (!new_buckets)
- return bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
-
- memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
- memmove(&new_buckets[pos],
- &new_buckets[pos + 1],
- (ja->nr - 1 - pos) * sizeof(new_buckets[0]));
-
-	int ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, ja->nr - 1) ?:
- bch2_write_super(c);
- if (ret) {
- kfree(new_buckets);
- return ret;
- }
-
- scoped_guard(spinlock, &j->lock) {
- if (pos < ja->discard_idx)
- --ja->discard_idx;
- if (pos < ja->dirty_idx_ondisk)
- --ja->dirty_idx_ondisk;
- if (pos < ja->dirty_idx)
- --ja->dirty_idx;
- if (pos < ja->cur_idx)
- --ja->cur_idx;
-
- ja->nr--;
-
- memmove(&ja->buckets[pos],
- &ja->buckets[pos + 1],
- (ja->nr - pos) * sizeof(ja->buckets[0]));
-
- memmove(&ja->bucket_seq[pos],
- &ja->bucket_seq[pos + 1],
- (ja->nr - pos) * sizeof(ja->bucket_seq[0]));
-
- bch2_journal_space_available(j);
- }
-
- kfree(new_buckets);
- return 0;
-}
-
-int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
-{
- struct bch_fs *c = ca->fs;
-
- if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal)))
- return 0;
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
- bch_err(c, "cannot allocate journal, filesystem is an unresized image file");
- return bch_err_throw(c, erofs_filesystem_full);
- }
-
- unsigned nr;
- int ret;
-
- if (dynamic_fault("bcachefs:add:journal_alloc")) {
- ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
- goto err;
- }
-
- /* 1/128th of the device by default: */
- nr = ca->mi.nbuckets >> 7;
-
- /*
- * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
- * is smaller:
- */
- nr = clamp_t(unsigned, nr,
- BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 13,
- (1 << 24) / ca->mi.bucket_size));
-
- ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs);
-err:
- bch_err_fn(ca, ret);
- return ret;
-}
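/*
 * Editor's worked example of the default sizing above, as a standalone
 * userspace sketch (the BCH_JOURNAL_BUCKETS_MIN value of 8 is assumed for
 * illustration). For a 1 TiB device with 1 MiB buckets (2048 sectors),
 * nbuckets >> 7 = 8192 and the cap min(2^13, 2^24 / 2048) = 8192, so the
 * journal ends up at 8192 buckets = 8 GiB.
 */
#include <stdio.h>

#define BCH_JOURNAL_BUCKETS_MIN	8	/* assumed value, for illustration only */

static unsigned clamp_unsigned(unsigned v, unsigned lo, unsigned hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
	unsigned long long nbuckets = 1ULL << 20;	/* 1 TiB / 1 MiB buckets */
	unsigned bucket_size = 2048;			/* 512-byte sectors per bucket */

	unsigned cap = (1U << 13) < (1U << 24) / bucket_size
		? 1U << 13
		: (1U << 24) / bucket_size;
	unsigned nr = clamp_unsigned(nbuckets >> 7, BCH_JOURNAL_BUCKETS_MIN, cap);

	printf("journal: %u buckets, %llu MiB\n",
	       nr, (unsigned long long) nr * bucket_size * 512 >> 20);
	return 0;
}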
-
-int bch2_fs_journal_alloc(struct bch_fs *c)
-{
- for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) {
- if (ca->journal.nr)
- continue;
-
- int ret = bch2_dev_journal_alloc(ca, true);
- if (ret) {
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_fs_journal_alloc);
- return ret;
- }
- }
-
- return 0;
-}
-
-/* startup/shutdown: */
-
-static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
-{
- bool ret = false;
- u64 seq;
-
- spin_lock(&j->lock);
- for (seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j) && !ret;
- seq++) {
- struct journal_buf *buf = journal_seq_to_buf(j, seq);
-
- if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
- ret = true;
- }
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
-{
- wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
-}
-
-void bch2_fs_journal_stop(struct journal *j)
-{
- if (!test_bit(JOURNAL_running, &j->flags))
- return;
-
- bch2_journal_reclaim_stop(j);
- bch2_journal_flush_all_pins(j);
-
- wait_event(j->wait, bch2_journal_entry_close(j));
-
- /*
- * Always write a new journal entry, to make sure the clock hands are up
- * to date (and match the superblock)
- */
- __bch2_journal_meta(j);
-
- journal_quiesce(j);
- cancel_delayed_work_sync(&j->write_work);
-
- WARN(!bch2_journal_error(j) &&
- test_bit(JOURNAL_replay_done, &j->flags) &&
- j->last_empty_seq != journal_cur_seq(j),
- "journal shutdown error: cur seq %llu but last empty seq %llu",
- journal_cur_seq(j), j->last_empty_seq);
-
- if (!bch2_journal_error(j))
- clear_bit(JOURNAL_running, &j->flags);
-}
-
-int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin_list *p;
- struct journal_replay *i, **_i;
- struct genradix_iter iter;
- bool had_entries = false;
-
-	/*
-	 * XXX pick most recent non-blacklisted sequence number
-	 */
-
- cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c));
-
- if (cur_seq >= JOURNAL_SEQ_MAX) {
- bch_err(c, "cannot start: journal seq overflow");
- return -EINVAL;
- }
-
- /* Clean filesystem? */
- if (!last_seq)
- last_seq = cur_seq;
-
- u64 nr = cur_seq - last_seq;
-
- /*
- * Extra fudge factor, in case we crashed when the journal pin fifo was
- * nearly or completely full. We'll need to be able to open additional
- * journal entries (at least a few) in order for journal replay to get
- * going:
- */
- nr += nr / 4;
-
- nr = max(nr, JOURNAL_PIN);
- init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
- if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
- return bch_err_throw(c, ENOMEM_journal_pin_fifo);
- }
-
- j->replay_journal_seq = last_seq;
- j->replay_journal_seq_end = cur_seq;
- j->last_seq_ondisk = last_seq;
- j->flushed_seq_ondisk = cur_seq - 1;
- j->seq_write_started = cur_seq - 1;
- j->seq_ondisk = cur_seq - 1;
- j->pin.front = last_seq;
- j->pin.back = cur_seq;
- atomic64_set(&j->seq, cur_seq - 1);
-
- u64 seq;
- fifo_for_each_entry_ptr(p, &j->pin, seq)
- journal_pin_list_init(p, 1);
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- seq = le64_to_cpu(i->j.seq);
- BUG_ON(seq >= cur_seq);
-
- if (seq < last_seq)
- continue;
-
- if (journal_entry_empty(&i->j))
- j->last_empty_seq = le64_to_cpu(i->j.seq);
-
- p = journal_seq_pin(j, seq);
-
- p->devs.nr = 0;
- darray_for_each(i->ptrs, ptr)
- bch2_dev_list_add_dev(&p->devs, ptr->dev);
-
- had_entries = true;
- }
-
- if (!had_entries)
- j->last_empty_seq = cur_seq - 1; /* to match j->seq */
-
- spin_lock(&j->lock);
- j->last_flush_write = jiffies;
-
- j->reservations.idx = journal_cur_seq(j);
-
- c->last_bucket_seq_cleanup = journal_cur_seq(j);
- spin_unlock(&j->lock);
-
- return 0;
-}
-
-void bch2_journal_set_replay_done(struct journal *j)
-{
- /*
- * journal_space_available must happen before setting JOURNAL_running
- * JOURNAL_running must happen before JOURNAL_replay_done
- */
- spin_lock(&j->lock);
- bch2_journal_space_available(j);
-
- set_bit(JOURNAL_need_flush_write, &j->flags);
- set_bit(JOURNAL_running, &j->flags);
- set_bit(JOURNAL_replay_done, &j->flags);
- spin_unlock(&j->lock);
-}
-
-/* init/exit: */
-
-void bch2_dev_journal_exit(struct bch_dev *ca)
-{
- struct journal_device *ja = &ca->journal;
-
- for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- kfree(ja->bio[i]);
- ja->bio[i] = NULL;
- }
-
- kfree(ja->buckets);
- kfree(ja->bucket_seq);
- ja->buckets = NULL;
- ja->bucket_seq = NULL;
-}
-
-int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets =
- bch2_sb_field_get(sb, journal);
- struct bch_sb_field_journal_v2 *journal_buckets_v2 =
- bch2_sb_field_get(sb, journal_v2);
-
- ja->nr = 0;
-
- if (journal_buckets_v2) {
- unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
-
- for (unsigned i = 0; i < nr; i++)
- ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
- } else if (journal_buckets) {
- ja->nr = bch2_nr_journal_buckets(journal_buckets);
- }
-
- ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
- if (!ja->bucket_seq)
- return bch_err_throw(c, ENOMEM_dev_journal_init);
-
- unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
-
- for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
- nr_bvecs), GFP_KERNEL);
- if (!ja->bio[i])
- return bch_err_throw(c, ENOMEM_dev_journal_init);
-
- ja->bio[i]->ca = ca;
- ja->bio[i]->buf_idx = i;
- bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
- }
-
- ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
- if (!ja->buckets)
- return bch_err_throw(c, ENOMEM_dev_journal_init);
-
- if (journal_buckets_v2) {
- unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
- unsigned dst = 0;
-
- for (unsigned i = 0; i < nr; i++)
- for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
- ja->buckets[dst++] =
- le64_to_cpu(journal_buckets_v2->d[i].start) + j;
- } else if (journal_buckets) {
- for (unsigned i = 0; i < ja->nr; i++)
- ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
- }
-
- return 0;
-}
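/*
 * Editor's illustration of the journal_v2 expansion done above: the v2
 * superblock field stores contiguous runs of buckets as (start, nr) pairs,
 * which bch2_dev_journal_init() flattens into ja->buckets[]. Standalone toy
 * with made-up ranges:
 */
#include <stdio.h>

struct range { unsigned long long start, nr; };

int main(void)
{
	struct range d[] = { { 128, 3 }, { 512, 2 } };	/* hypothetical ranges */
	unsigned long long buckets[8];
	unsigned dst = 0;

	for (unsigned i = 0; i < sizeof(d) / sizeof(d[0]); i++)
		for (unsigned long long j = 0; j < d[i].nr; j++)
			buckets[dst++] = d[i].start + j;

	for (unsigned i = 0; i < dst; i++)
		printf("bucket %u = %llu\n", i, buckets[i]);
	/* prints 128, 129, 130, 512, 513 */
	return 0;
}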
-
-void bch2_fs_journal_exit(struct journal *j)
-{
- if (j->wq)
- destroy_workqueue(j->wq);
-
- darray_exit(&j->early_journal_entries);
-
- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
- kvfree(j->buf[i].data);
- kvfree(j->free_buf);
- free_fifo(&j->pin);
-}
-
-void bch2_fs_journal_init_early(struct journal *j)
-{
- static struct lock_class_key res_key;
-
- mutex_init(&j->buf_lock);
- spin_lock_init(&j->lock);
- spin_lock_init(&j->err_lock);
- init_waitqueue_head(&j->wait);
- INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- init_waitqueue_head(&j->reclaim_wait);
- init_waitqueue_head(&j->pin_flush_wait);
- mutex_init(&j->reclaim_lock);
- mutex_init(&j->discard_lock);
-
- lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
-
- atomic64_set(&j->reservations.counter,
- ((union journal_res_state)
- { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
-}
-
-int bch2_fs_journal_init(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
- j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
- if (!j->free_buf)
- return bch_err_throw(c, ENOMEM_journal_buf);
-
- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
- j->buf[i].idx = i;
-
- j->wq = alloc_workqueue("bcachefs_journal",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
- if (!j->wq)
- return bch_err_throw(c, ENOMEM_fs_other_alloc);
- return 0;
-}
-
-/* debug: */
-
-static const char * const bch2_journal_flags_strs[] = {
-#define x(n) #n,
- JOURNAL_FLAGS()
-#undef x
- NULL
-};
-
-void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union journal_res_state s;
- unsigned long now = jiffies;
- u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 28);
- out->atomic++;
-
- guard(rcu)();
- s = READ_ONCE(j->reservations);
-
- prt_printf(out, "flags:\t");
- prt_bitflags(out, bch2_journal_flags_strs, j->flags);
- prt_newline(out);
- prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
- prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j));
- prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j));
- prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
- prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]);
- prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
- prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
- prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
- prt_printf(out, "average write size:\t");
- prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
- prt_newline(out);
- prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0);
- prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
- prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
- prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
- prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
- ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- prt_printf(out, "blocked:\t%u\n", j->blocked);
- prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error));
- prt_printf(out, "current entry:\t");
-
- switch (s.cur_entry_offset) {
- case JOURNAL_ENTRY_ERROR_VAL:
- prt_printf(out, "error\n");
- break;
- case JOURNAL_ENTRY_CLOSED_VAL:
- prt_printf(out, "closed\n");
- break;
- case JOURNAL_ENTRY_BLOCKED_VAL:
- prt_printf(out, "blocked\n");
- break;
- default:
- prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
- break;
- }
-
- prt_printf(out, "unwritten entries:\n");
- bch2_journal_bufs_to_text(out, j);
-
- prt_printf(out, "space:\n");
- printbuf_indent_add(out, 2);
- prt_printf(out, "discarded\t%u:%u\n",
- j->space[journal_space_discarded].next_entry,
- j->space[journal_space_discarded].total);
- prt_printf(out, "clean ondisk\t%u:%u\n",
- j->space[journal_space_clean_ondisk].next_entry,
- j->space[journal_space_clean_ondisk].total);
- prt_printf(out, "clean\t%u:%u\n",
- j->space[journal_space_clean].next_entry,
- j->space[journal_space_clean].total);
- prt_printf(out, "total\t%u:%u\n",
- j->space[journal_space_total].next_entry,
- j->space[journal_space_total].total);
- printbuf_indent_sub(out, 2);
-
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- if (!ca->mi.durability)
- continue;
-
- struct journal_device *ja = &ca->journal;
-
- if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
- continue;
-
- if (!ja->nr)
- continue;
-
- prt_printf(out, "dev %u:\n", ca->dev_idx);
- prt_printf(out, "durability %u:\n", ca->mi.durability);
- printbuf_indent_add(out, 2);
- prt_printf(out, "nr\t%u\n", ja->nr);
- prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size);
- prt_printf(out, "available\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
- prt_printf(out, "discard_idx\t%u\n", ja->discard_idx);
-		prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
- prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
- prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
- printbuf_indent_sub(out, 2);
- }
-
- prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required);
-
- --out->atomic;
-}
-
-void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
-{
- spin_lock(&j->lock);
- __bch2_journal_debug_to_text(out, j);
- spin_unlock(&j->lock);
-}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
deleted file mode 100644
index 977907038d98..000000000000
--- a/fs/bcachefs/journal.h
+++ /dev/null
@@ -1,465 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_H
-#define _BCACHEFS_JOURNAL_H
-
-/*
- * THE JOURNAL:
- *
- * The primary purpose of the journal is to log updates (insertions) to the
- * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
- *
- * Without the journal, the b-tree is always internally consistent on
- * disk - and in fact, in the earliest incarnations bcache didn't have a journal
- * but did handle unclean shutdowns by doing all index updates synchronously
- * (with coalescing).
- *
- * Updates to interior nodes still happen synchronously and without the journal
- * (for simplicity) - this may change eventually but updates to interior nodes
- * are rare enough it's not a huge priority.
- *
- * This means the journal is relatively separate from the b-tree; it consists of
- * just a list of keys and journal replay consists of just redoing those
- * insertions in same order that they appear in the journal.
- *
- * PERSISTENCE:
- *
- * For synchronous updates (where we're waiting on the index update to hit
- * disk), the journal entry will be written out immediately (or as soon as
- * possible, if the write for the previous journal entry was still in flight).
- *
- * Synchronous updates are specified by passing a closure (@flush_cl) to
- * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
- * down to the journalling code. That closure will wait on the journal write to
- * complete (via closure_wait()).
- *
- * If the index update wasn't synchronous, the journal entry will be
- * written out after 10 ms have elapsed, by default (the delay_ms field
- * in struct journal).
- *
- * JOURNAL ENTRIES:
- *
- * A journal entry is variable size (struct jset), it's got a fixed length
- * header and then a variable number of struct jset_entry entries.
- *
- * Journal entries are identified by monotonically increasing 64 bit sequence
- * numbers - jset->seq; other places in the code refer to this sequence number.
- *
- * A jset_entry entry contains one or more bkeys (which is what gets inserted
- * into the b-tree). We need a container to indicate which b-tree the key is
- * for; also, the roots of the various b-trees are stored in jset_entry entries
- * (one for each b-tree) - this lets us add new b-tree types without changing
- * the on disk format.
- *
- * We also keep some things in the journal header that are logically part of the
- * superblock - all the things that are frequently updated. This is for future
- * bcache on raw flash support; the superblock (which will become another
- * journal) can't be moved or wear leveled, so it contains just enough
- * information to find the main journal, and the superblock only has to be
- * rewritten when we want to move/wear level the main journal.
- *
- * JOURNAL LAYOUT ON DISK:
- *
- * The journal is written to a ringbuffer of buckets (which is kept in the
- * superblock); the individual buckets are not necessarily contiguous on disk
- * which means that journal entries are not allowed to span buckets, but also
- * that we can resize the journal at runtime if desired (unimplemented).
- *
- * The journal buckets exist in the same pool as all the other buckets that are
- * managed by the allocator and garbage collection - garbage collection marks
- * the journal buckets as metadata buckets.
- *
- * OPEN/DIRTY JOURNAL ENTRIES:
- *
- * Open/dirty journal entries are journal entries that contain b-tree updates
- * that have not yet been written out to the b-tree on disk. We have to track
- * which journal entries are dirty, and we also have to avoid wrapping around
- * the journal and overwriting old but still dirty journal entries with new
- * journal entries.
- *
- * On disk, this is represented with the "last_seq" field of struct jset;
- * last_seq is the first sequence number that journal replay has to replay.
- *
- * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
- * journal_device->seq) of for each journal bucket, the highest sequence number
- * any journal entry it contains. Then, by comparing that against last_seq we
- * can determine whether that journal bucket contains dirty journal entries or
- * not.
- *
- * To track which journal entries are dirty, we maintain a fifo of refcounts
- * (where each entry corresponds to a specific sequence number) - when a ref
- * goes to 0, that journal entry is no longer dirty.
- *
- * Journalling of index updates is done at the same time as the b-tree itself is
- * being modified (see btree_insert_key()); when we add the key to the journal
- * the pending b-tree write takes a ref on the journal entry the key was added
- * to. If a pending b-tree write would need to take refs on multiple dirty
- * journal entries, it only keeps the ref on the oldest one (since a newer
- * journal entry will still be replayed if an older entry was dirty).
- *
- * JOURNAL FILLING UP:
- *
- * There are two ways the journal could fill up; either we could run out of
- * space to write to, or we could have too many open journal entries and run out
- * of room in the fifo of refcounts. Since those refcounts are decremented
- * without any locking we can't safely resize that fifo, so we handle it the
- * same way.
- *
- * If the journal fills up, we start flushing dirty btree nodes until we can
- * allocate space for a journal write again - preferentially flushing btree
- * nodes that are pinning the oldest journal entries first.
- */
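/*
 * Editor's toy model (not the kernel implementation) of the OPEN/DIRTY
 * JOURNAL ENTRIES scheme described above: an array of refcounts indexed by
 * sequence number, where last_seq only advances once the oldest entry's
 * refcount drops to zero. That is what bounds how far back replay must go.
 */
#include <stdio.h>

#define NR_SEQS 8

static unsigned pin_count[NR_SEQS];
static unsigned long long last_seq;	/* oldest dirty sequence number */
static unsigned long long cur_seq;

static void pin(unsigned long long seq)
{
	pin_count[seq % NR_SEQS]++;
}

static void unpin(unsigned long long seq)
{
	pin_count[seq % NR_SEQS]--;

	/* pop clean entries off the front, advancing last_seq */
	while (last_seq < cur_seq && !pin_count[last_seq % NR_SEQS])
		last_seq++;
}

int main(void)
{
	/* three journal entries, each pinned by one pending btree write: */
	for (cur_seq = 0; cur_seq < 3; cur_seq++)
		pin(cur_seq);

	unpin(1);	/* a middle entry goes clean: last_seq can't move yet */
	printf("last_seq = %llu\n", last_seq);	/* 0 */
	unpin(0);	/* oldest entry goes clean: last_seq skips past seq 1 too */
	printf("last_seq = %llu\n", last_seq);	/* 2 */
	unpin(2);
	printf("last_seq = %llu\n", last_seq);	/* 3 */
	return 0;
}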
-
-#include <linux/hash.h>
-
-#include "journal_types.h"
-
-struct bch_fs;
-
-static inline void journal_wake(struct journal *j)
-{
- wake_up(&j->wait);
- closure_wake_up(&j->async_wait);
-}
-
-/* Sequence number of oldest dirty journal entry */
-
-static inline u64 journal_last_seq(struct journal *j)
-{
- return j->pin.front;
-}
-
-static inline u64 journal_cur_seq(struct journal *j)
-{
- return atomic64_read(&j->seq);
-}
-
-static inline u64 journal_last_unwritten_seq(struct journal *j)
-{
- return j->seq_ondisk + 1;
-}
-
-static inline struct journal_buf *journal_cur_buf(struct journal *j)
-{
- unsigned idx = (journal_cur_seq(j) &
- JOURNAL_BUF_MASK &
- ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx;
-
- return j->buf + idx;
-}
-
-static inline int journal_state_count(union journal_res_state s, int idx)
-{
- switch (idx) {
- case 0: return s.buf0_count;
- case 1: return s.buf1_count;
- case 2: return s.buf2_count;
- case 3: return s.buf3_count;
- }
- BUG();
-}
-
-static inline int journal_state_seq_count(struct journal *j,
- union journal_res_state s, u64 seq)
-{
- if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR)
- return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK);
- else
- return 0;
-}
-
-static inline void journal_state_inc(union journal_res_state *s)
-{
- s->buf0_count += s->idx == 0;
- s->buf1_count += s->idx == 1;
- s->buf2_count += s->idx == 2;
- s->buf3_count += s->idx == 3;
-}
-
-/*
- * Amount of space that will be taken up by some keys in the journal (i.e.
- * including the jset header)
- */
-static inline unsigned jset_u64s(unsigned u64s)
-{
- return u64s + sizeof(struct jset_entry) / sizeof(u64);
-}
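/*
 * Editor's note: struct jset_entry has an 8-byte header (u64s, btree_id,
 * level, type, pad[3]) followed by the key data, so with that layout
 * jset_u64s(n) works out to n + 1 - e.g. journalling a 3-u64 bkey costs
 * 4 u64s of reservation.
 */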
-
-static inline int journal_entry_overhead(struct journal *j)
-{
- return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
-}
-
-static inline struct jset_entry *
-bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
-{
- struct jset *jset = buf->data;
- struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
-
- memset(entry, 0, sizeof(*entry));
- entry->u64s = cpu_to_le16(u64s);
-
- le32_add_cpu(&jset->u64s, jset_u64s(u64s));
-
- return entry;
-}
-
-static inline struct jset_entry *
-journal_res_entry(struct journal *j, struct journal_res *res)
-{
- return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset);
-}
-
-static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
- enum btree_id id, unsigned level,
- unsigned u64s)
-{
- entry->u64s = cpu_to_le16(u64s);
- entry->btree_id = id;
- entry->level = level;
- entry->type = type;
- entry->pad[0] = 0;
- entry->pad[1] = 0;
- entry->pad[2] = 0;
- return jset_u64s(u64s);
-}
-
-static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
- enum btree_id id, unsigned level,
- const void *data, unsigned u64s)
-{
- unsigned ret = journal_entry_init(entry, type, id, level, u64s);
-
- memcpy_u64s_small(entry->_data, data, u64s);
- return ret;
-}
-
-static inline struct jset_entry *
-bch2_journal_add_entry(struct journal *j, struct journal_res *res,
- unsigned type, enum btree_id id,
- unsigned level, unsigned u64s)
-{
- struct jset_entry *entry = journal_res_entry(j, res);
- unsigned actual = journal_entry_init(entry, type, id, level, u64s);
-
- EBUG_ON(!res->ref);
- EBUG_ON(actual > res->u64s);
-
- res->offset += actual;
- res->u64s -= actual;
- return entry;
-}
-
-static inline bool journal_entry_empty(struct jset *j)
-{
- if (j->seq != j->last_seq)
- return false;
-
- vstruct_for_each(j, i)
- if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)
- return false;
- return true;
-}
-
-/*
- * Drop a reference on a buffer index and return the new reservation state, so
- * the caller can check whether that buffer's count has hit zero.
- */
-static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
-{
- union journal_res_state s;
-
- s.v = atomic64_sub_return(((union journal_res_state) {
- .buf0_count = idx == 0,
- .buf1_count = idx == 1,
- .buf2_count = idx == 2,
- .buf3_count = idx == 3,
- }).v, &j->reservations.counter);
- return s;
-}
-
-bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_do_writes(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64);
-
-static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
-{
- unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
- union journal_res_state s;
-
- s = journal_state_buf_put(j, idx);
- if (!journal_state_count(s, idx))
- bch2_journal_buf_put_final(j, seq);
-}
-
-static inline void bch2_journal_buf_put(struct journal *j, u64 seq)
-{
- unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
- union journal_res_state s;
-
- s = journal_state_buf_put(j, idx);
- if (!journal_state_count(s, idx)) {
- spin_lock(&j->lock);
- bch2_journal_buf_put_final(j, seq);
- spin_unlock(&j->lock);
- } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL))
- wake_up(&j->wait);
-}
-
-/*
- * This function releases the journal write structure so other threads can
- * then proceed to add their keys as well.
- */
-static inline void bch2_journal_res_put(struct journal *j,
- struct journal_res *res)
-{
- if (!res->ref)
- return;
-
- lock_release(&j->res_map, _THIS_IP_);
-
- while (res->u64s)
- bch2_journal_add_entry(j, res,
- BCH_JSET_ENTRY_btree_keys,
- 0, 0, 0);
-
- bch2_journal_buf_put(j, res->seq);
-
- res->ref = 0;
-}
-
-int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
- unsigned, struct btree_trans *);
-
-/* First bits for BCH_WATERMARK: */
-enum journal_res_flags {
- __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS,
- __JOURNAL_RES_GET_CHECK,
-};
-
-#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK)
-#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK)
-
-static inline int journal_res_get_fast(struct journal *j,
- struct journal_res *res,
- unsigned flags)
-{
- union journal_res_state old, new;
-
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
-
- /*
- * Check if there is still room in the current journal
-		 * entry; smp_rmb() guarantees that reads from reservations.counter
- * occur before accessing cur_entry_u64s:
- */
- smp_rmb();
- if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
- return 0;
-
- EBUG_ON(!journal_state_count(new, new.idx));
-
- if ((flags & BCH_WATERMARK_MASK) < j->watermark)
- return 0;
-
- new.cur_entry_offset += res->u64s;
- journal_state_inc(&new);
-
- /*
- * If the refcount would overflow, we have to wait:
- * XXX - tracepoint this:
- */
- if (!journal_state_count(new, new.idx))
- return 0;
-
- if (flags & JOURNAL_RES_GET_CHECK)
- return 1;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
- res->ref = true;
- res->offset = old.cur_entry_offset;
- res->seq = journal_cur_seq(j);
- res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK;
- return 1;
-}
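/*
 * Editor's toy model of the lock-free fast path above: atomically claim
 * space by advancing an offset with compare-and-exchange, bailing out when
 * the entry is full. Refcounts, watermarks and buffer switching are omitted;
 * this only illustrates the shape of the cmpxchg loop.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ENTRY_U64S 32

static _Atomic unsigned cur_offset;

static bool toy_res_get(unsigned u64s, unsigned *offset)
{
	unsigned old = atomic_load(&cur_offset);

	do {
		if (old + u64s > ENTRY_U64S)
			return false;	/* entry full: a real caller takes the slow path */
	} while (!atomic_compare_exchange_weak(&cur_offset, &old, old + u64s));

	*offset = old;			/* our slice is [old, old + u64s) */
	return true;
}

int main(void)
{
	unsigned off;

	for (int i = 0; i < 5; i++) {
		if (toy_res_get(10, &off))
			printf("reserved 10 u64s at offset %u\n", off);
		else
			printf("reservation failed, entry full\n");
	}
	return 0;
}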
-
-static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
- unsigned u64s, unsigned flags,
- struct btree_trans *trans)
-{
- int ret;
-
- EBUG_ON(res->ref);
- EBUG_ON(!test_bit(JOURNAL_running, &j->flags));
-
- res->u64s = u64s;
-
- if (journal_res_get_fast(j, res, flags))
- goto out;
-
- ret = bch2_journal_res_get_slowpath(j, res, flags, trans);
- if (ret)
- return ret;
-out:
- if (!(flags & JOURNAL_RES_GET_CHECK)) {
- lock_acquire_shared(&j->res_map, 0,
- (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
- NULL, _THIS_IP_);
- EBUG_ON(!res->ref);
- BUG_ON(!res->seq);
- }
- return 0;
-}
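/*
 * Editor's illustration of the reservation lifecycle these helpers implement:
 * a hypothetical caller (example_journal_one_key() is not a real function)
 * reserves space for one bkey, copies it into the current journal buffer and
 * drops the reservation; the entry is written out later by the journal
 * machinery. Error handling and the real btree-update plumbing are elided.
 */
static int example_journal_one_key(struct journal *j, enum btree_id btree,
				   struct bkey_i *k)
{
	struct journal_res res = {};

	int ret = bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s), 0, NULL);
	if (ret)
		return ret;

	struct jset_entry *entry =
		bch2_journal_add_entry(j, &res, BCH_JSET_ENTRY_btree_keys,
				       btree, 0, k->k.u64s);
	memcpy_u64s_small(entry->_data, k, k->k.u64s);

	/* any unused reservation is padded with empty entries on put: */
	bch2_journal_res_put(j, &res);
	return 0;
}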
-
-/* journal_entry_res: */
-
-void bch2_journal_entry_res_resize(struct journal *,
- struct journal_entry_res *,
- unsigned);
-
-int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
-void bch2_journal_flush_async(struct journal *, struct closure *);
-
-int bch2_journal_flush_seq(struct journal *, u64, unsigned);
-int bch2_journal_flush(struct journal *);
-bool bch2_journal_noflush_seq(struct journal *, u64, u64);
-int bch2_journal_meta(struct journal *);
-
-void bch2_journal_halt_locked(struct journal *);
-void bch2_journal_halt(struct journal *);
-
-static inline int bch2_journal_error(struct journal *j)
-{
- return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
- ? -BCH_ERR_journal_shutdown : 0;
-}
-
-struct bch_dev;
-
-void bch2_journal_unblock(struct journal *);
-void bch2_journal_block(struct journal *);
-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
-
-void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-
-int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned);
-int bch2_dev_journal_bucket_delete(struct bch_dev *, u64);
-
-int bch2_dev_journal_alloc(struct bch_dev *, bool);
-int bch2_fs_journal_alloc(struct bch_fs *);
-
-void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
-
-void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64, u64);
-void bch2_journal_set_replay_done(struct journal *);
-
-void bch2_dev_journal_exit(struct bch_dev *);
-int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
-void bch2_fs_journal_exit(struct journal *);
-void bch2_fs_journal_init_early(struct journal *);
-int bch2_fs_journal_init(struct journal *);
-
-#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
deleted file mode 100644
index 9e028dbcc3d0..000000000000
--- a/fs/bcachefs/journal_io.c
+++ /dev/null
@@ -1,2242 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_io.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-#include <linux/string_choices.h>
-#include <linux/sched/sysctl.h>
-
-void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
-{
- lockdep_assert_held(&c->sb_lock);
-
- for_each_member_device(c, ca) {
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-
- m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
- m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
- }
-}
-
-void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
-{
- mutex_lock(&c->sb_lock);
- for_each_member_device(c, ca) {
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
-
- unsigned idx = le32_to_cpu(m.last_journal_bucket);
- if (idx < ca->journal.nr)
- ca->journal.cur_idx = idx;
- unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
- if (offset <= ca->mi.bucket_size)
- ca->journal.sectors_free = ca->mi.bucket_size - offset;
- }
- mutex_unlock(&c->sb_lock);
-}
-
-static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p)
-{
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev);
- prt_printf(out, "%s %u:%u:%u (sector %llu)",
- ca ? ca->name : "(invalid dev)",
- p->dev, p->bucket, p->bucket_offset, p->sector);
- bch2_dev_put(ca);
-}
-
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j)
-{
- darray_for_each(j->ptrs, i) {
- if (i != j->ptrs.data)
- prt_printf(out, " ");
- bch2_journal_ptr_to_text(out, c, i);
- }
-}
-
-static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j)
-{
- for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) {
- struct jset_entry_datetime *datetime =
- container_of(entry, struct jset_entry_datetime, entry);
- bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
- break;
- }
-}
-
-static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
-{
- prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
- bch2_journal_datetime_to_text(out, &j->j);
- prt_char(out, ' ');
- bch2_journal_ptrs_to_text(out, c, j);
-}
-
-static struct nonce journal_nonce(const struct jset *jset)
-{
- return (struct nonce) {{
- [0] = 0,
- [1] = ((__le32 *) &jset->seq)[0],
- [2] = ((__le32 *) &jset->seq)[1],
- [3] = BCH_NONCE_JOURNAL,
- }};
-}
-
-static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
-{
- if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
- *csum = (struct bch_csum) {};
- return false;
- }
-
- *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
- return !bch2_crc_cmp(j->csum, *csum);
-}
-
-static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
-{
- return (seq - c->journal_entries_base_seq) & (~0U >> 1);
-}
-
-static void __journal_replay_free(struct bch_fs *c,
- struct journal_replay *i)
-{
- struct journal_replay **p =
- genradix_ptr(&c->journal_entries,
- journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
-
- BUG_ON(*p != i);
- *p = NULL;
- kvfree(i);
-}
-
-static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
-{
- if (blacklisted)
- i->ignore_blacklisted = true;
- else
- i->ignore_not_dirty = true;
-
- if (!c->opts.read_entire_journal)
- __journal_replay_free(c, i);
-}
-
-struct journal_list {
- struct closure cl;
- u64 last_seq;
- struct mutex lock;
- int ret;
-};
-
-#define JOURNAL_ENTRY_ADD_OK 0
-#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
-
-/*
- * Given a journal entry we just read, add it to the list of journal entries to
- * be replayed:
- */
-static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
- struct journal_ptr entry_ptr,
- struct journal_list *jlist, struct jset *j)
-{
- struct genradix_iter iter;
- struct journal_replay **_i, *i, *dup;
- size_t bytes = vstruct_bytes(j);
- u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
- struct printbuf buf = PRINTBUF;
- int ret = JOURNAL_ENTRY_ADD_OK;
-
- if (last_seq && c->opts.journal_rewind)
- last_seq = min(last_seq, c->opts.journal_rewind);
-
- if (!c->journal.oldest_seq_found_ondisk ||
- le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
- c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
-
- /* Is this entry older than the range we need? */
- if (!c->opts.read_entire_journal &&
- le64_to_cpu(j->seq) < jlist->last_seq)
- return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-
- /*
- * genradixes are indexed by a ulong, not a u64, so we can't index them
-	 * by sequence number directly; assume instead that they will all fall
-	 * within a range of +/- 2 billion of the first one we find.
- */
- if (!c->journal_entries_base_seq)
- c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
-
- /* Drop entries we don't need anymore */
- if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
- genradix_for_each_from(&c->journal_entries, iter, _i,
- journal_entry_radix_idx(c, jlist->last_seq)) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- if (le64_to_cpu(i->j.seq) >= last_seq)
- break;
-
- journal_replay_free(c, i, false);
- }
- }
-
- jlist->last_seq = max(jlist->last_seq, last_seq);
-
- _i = genradix_ptr_alloc(&c->journal_entries,
- journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
- GFP_KERNEL);
- if (!_i)
- return bch_err_throw(c, ENOMEM_journal_entry_add);
-
- /*
- * Duplicate journal entries? If so we want the one that didn't have a
- * checksum error:
- */
- dup = *_i;
- if (dup) {
- bool identical = bytes == vstruct_bytes(&dup->j) &&
- !memcmp(j, &dup->j, bytes);
- bool not_identical = !identical &&
- entry_ptr.csum_good &&
- dup->csum_good;
-
- bool same_device = false;
- darray_for_each(dup->ptrs, ptr)
- if (ptr->dev == ca->dev_idx)
- same_device = true;
-
- ret = darray_push(&dup->ptrs, entry_ptr);
- if (ret)
- goto out;
-
- bch2_journal_replay_to_text(&buf, c, dup);
-
- fsck_err_on(same_device,
- c, journal_entry_dup_same_device,
- "duplicate journal entry on same device\n%s",
- buf.buf);
-
- fsck_err_on(not_identical,
- c, journal_entry_replicas_data_mismatch,
- "found duplicate but non identical journal entries\n%s",
- buf.buf);
-
- if (entry_ptr.csum_good && !identical)
- goto replace;
-
- goto out;
- }
-replace:
- i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
- if (!i)
- return bch_err_throw(c, ENOMEM_journal_entry_add);
-
- darray_init(&i->ptrs);
- i->csum_good = entry_ptr.csum_good;
- i->ignore_blacklisted = false;
- i->ignore_not_dirty = false;
- unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
-
- if (dup) {
- /* The first ptr should represent the jset we kept: */
- darray_for_each(dup->ptrs, ptr)
- darray_push(&i->ptrs, *ptr);
- __journal_replay_free(c, dup);
- } else {
- darray_push(&i->ptrs, entry_ptr);
- }
-
- *_i = i;
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/* this fills in a range with empty jset_entries: */
-static void journal_entry_null_range(void *start, void *end)
-{
- struct jset_entry *entry;
-
- for (entry = start; entry != end; entry = vstruct_next(entry))
- memset(entry, 0, sizeof(*entry));
-}
-
-#define JOURNAL_ENTRY_REREAD 5
-#define JOURNAL_ENTRY_NONE 6
-#define JOURNAL_ENTRY_BAD 7
-
-static void journal_entry_err_msg(struct printbuf *out,
- u32 version,
- struct jset *jset,
- struct jset_entry *entry)
-{
- prt_str(out, "invalid journal entry, version=");
- bch2_version_to_text(out, version);
-
- if (entry) {
- prt_str(out, " type=");
- bch2_prt_jset_entry_type(out, entry->type);
- }
-
- if (!jset) {
- prt_printf(out, " in superblock");
- } else {
- prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
-
- if (entry)
- prt_printf(out, " offset=%zi/%u",
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s));
- }
-
- prt_str(out, ": ");
-}
-
-#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \
-({ \
- struct printbuf _buf = PRINTBUF; \
- \
- journal_entry_err_msg(&_buf, version, jset, entry); \
- prt_printf(&_buf, msg, ##__VA_ARGS__); \
- \
- switch (from.flags & BCH_VALIDATE_write) { \
- case READ: \
- mustfix_fsck_err(c, _err, "%s", _buf.buf); \
- break; \
- case WRITE: \
- bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
- if (bch2_fs_inconsistent(c, \
- "corrupt metadata before write: %s\n", _buf.buf)) {\
- ret = bch_err_throw(c, fsck_errors_not_fixed); \
- goto fsck_err; \
- } \
- break; \
- } \
- \
- printbuf_exit(&_buf); \
- true; \
-})
-
-#define journal_entry_err_on(cond, ...) \
- ((cond) ? journal_entry_err(__VA_ARGS__) : false)
-
-#define FSCK_DELETED_KEY 5
-
-static int journal_validate_key(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- struct bkey_i *k,
- struct bkey_validate_context from,
- unsigned version, int big_endian)
-{
- enum bch_validate_flags flags = from.flags;
- int write = flags & BCH_VALIDATE_write;
- void *next = vstruct_next(entry);
- int ret = 0;
-
- if (journal_entry_err_on(!k->k.u64s,
- c, version, jset, entry,
- journal_entry_bkey_u64s_0,
- "k->u64s 0")) {
- entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
-
- if (journal_entry_err_on((void *) bkey_next(k) >
- (void *) vstruct_next(entry),
- c, version, jset, entry,
- journal_entry_bkey_past_end,
- "extends past end of journal entry")) {
- entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
-
- if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
- c, version, jset, entry,
- journal_entry_bkey_bad_format,
- "bad format %u", k->k.format)) {
- le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
- memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
-
- if (!write)
- bch2_bkey_compat(from.level, from.btree, version, big_endian,
- write, NULL, bkey_to_packed(k));
-
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
- if (ret == -BCH_ERR_fsck_delete_bkey) {
- le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
- memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(vstruct_next(entry), next);
- return FSCK_DELETED_KEY;
- }
- if (ret)
- goto fsck_err;
-
- if (write)
- bch2_bkey_compat(from.level, from.btree, version, big_endian,
- write, NULL, bkey_to_packed(k));
-fsck_err:
- return ret;
-}
-
-static int journal_entry_btree_keys_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct bkey_i *k = entry->start;
-
- from.level = entry->level;
- from.btree = entry->btree_id;
-
- while (k != vstruct_last(entry)) {
- int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
- if (ret == FSCK_DELETED_KEY)
- continue;
- else if (ret)
- return ret;
-
- k = bkey_next(k);
- }
-
- return 0;
-}
-
-static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- bool first = true;
-
- jset_entry_for_each_key(entry, k) {
- /* We may be called on entries that haven't been validated: */
- if (!k->k.u64s)
- break;
-
- if (!first) {
- prt_newline(out);
- bch2_prt_jset_entry_type(out, entry->type);
- prt_str(out, ": ");
- }
- bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
- prt_char(out, ' ');
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
- first = false;
- }
-}
-
-static int journal_entry_btree_root_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct bkey_i *k = entry->start;
- int ret = 0;
-
- from.root = true;
- from.level = entry->level + 1;
- from.btree = entry->btree_id;
-
- if (journal_entry_err_on(!entry->u64s ||
- le16_to_cpu(entry->u64s) != k->k.u64s,
- c, version, jset, entry,
- journal_entry_btree_root_bad_size,
- "invalid btree root journal entry: wrong number of keys")) {
- void *next = vstruct_next(entry);
- /*
- * we don't want to null out this jset_entry,
- * just the contents, so that later we can tell
- * we were _supposed_ to have a btree root
- */
- entry->u64s = 0;
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-
- ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
- if (ret == FSCK_DELETED_KEY)
- ret = 0;
-fsck_err:
- return ret;
-}
-
-static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- /* obsolete, don't care: */
- return 0;
-}
-
-static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
-}
-
-static int journal_entry_blacklist_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
- c, version, jset, entry,
- journal_entry_blacklist_bad_size,
- "invalid journal seq blacklist entry: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- }
-fsck_err:
- return ret;
-}
-
-static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_blacklist *bl =
- container_of(entry, struct jset_entry_blacklist, entry);
-
- prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
-}
-
-static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_blacklist_v2 *bl_entry;
- int ret = 0;
-
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
- c, version, jset, entry,
- journal_entry_blacklist_v2_bad_size,
- "invalid journal seq blacklist entry: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- goto out;
- }
-
- bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
-
- if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
- le64_to_cpu(bl_entry->end),
- c, version, jset, entry,
- journal_entry_blacklist_v2_start_past_end,
- "invalid journal seq blacklist entry: start > end")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- }
-out:
-fsck_err:
- return ret;
-}
-
-static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_blacklist_v2 *bl =
- container_of(entry, struct jset_entry_blacklist_v2, entry);
-
- prt_printf(out, "start=%llu end=%llu",
- le64_to_cpu(bl->start),
- le64_to_cpu(bl->end));
-}
-
-static int journal_entry_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- int ret = 0;
-
- if (journal_entry_err_on(bytes < sizeof(*u),
- c, version, jset, entry,
- journal_entry_usage_bad_size,
- "invalid journal entry usage: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
-fsck_err:
- return ret;
-}
-
-static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
-
- prt_str(out, "type=");
- bch2_prt_fs_usage_type(out, u->entry.btree_id);
- prt_printf(out, " v=%llu", le64_to_cpu(u->v));
-}
-
-static int journal_entry_data_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- struct printbuf err = PRINTBUF;
- int ret = 0;
-
- if (journal_entry_err_on(bytes < sizeof(*u) ||
- bytes < sizeof(*u) + u->r.nr_devs,
- c, version, jset, entry,
- journal_entry_data_usage_bad_size,
- "invalid journal entry usage: bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- goto out;
- }
-
- if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
- c, version, jset, entry,
- journal_entry_data_usage_bad_size,
- "invalid journal entry usage: %s", err.buf)) {
- journal_entry_null_range(entry, vstruct_next(entry));
- goto out;
- }
-out:
-fsck_err:
- printbuf_exit(&err);
- return ret;
-}
-
-static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
-
- bch2_replicas_entry_to_text(out, &u->r);
- prt_printf(out, "=%llu", le64_to_cpu(u->v));
-}
-
-static int journal_entry_clock_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_clock *clock =
- container_of(entry, struct jset_entry_clock, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- int ret = 0;
-
- if (journal_entry_err_on(bytes != sizeof(*clock),
- c, version, jset, entry,
- journal_entry_clock_bad_size,
- "bad size")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
- if (journal_entry_err_on(clock->rw > 1,
- c, version, jset, entry,
- journal_entry_clock_bad_rw,
- "bad rw")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
-fsck_err:
- return ret;
-}
-
-static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_clock *clock =
- container_of(entry, struct jset_entry_clock, entry);
-
- prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
-}
-
-static int journal_entry_dev_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- struct jset_entry_dev_usage *u =
- container_of(entry, struct jset_entry_dev_usage, entry);
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- unsigned expected = sizeof(*u);
- int ret = 0;
-
- if (journal_entry_err_on(bytes < expected,
- c, version, jset, entry,
- journal_entry_dev_usage_bad_size,
- "bad size (%u < %u)",
- bytes, expected)) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
- if (journal_entry_err_on(u->pad,
- c, version, jset, entry,
- journal_entry_dev_usage_bad_pad,
- "bad pad")) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-
-fsck_err:
- return ret;
-}
-
-static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_dev_usage *u =
- container_of(entry, struct jset_entry_dev_usage, entry);
- unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
-
- if (vstruct_bytes(entry) < sizeof(*u))
- return;
-
- prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
-
- printbuf_indent_add(out, 2);
- for (i = 0; i < nr_types; i++) {
- prt_newline(out);
- bch2_prt_data_type(out, i);
- prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
- le64_to_cpu(u->d[i].buckets),
- le64_to_cpu(u->d[i].sectors),
- le64_to_cpu(u->d[i].fragmented));
- }
- printbuf_indent_sub(out, 2);
-}
-
-static int journal_entry_log_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
-
- prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
-}
-
-static int journal_entry_overwrite_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- from.flags = 0;
- return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, from);
-}
-
-static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_log_bkey_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- from.flags = 0;
- return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, from);
-}
-
-static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- return journal_entry_btree_keys_validate(c, jset, entry,
- version, big_endian, from);
-}
-
-static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- journal_entry_btree_keys_to_text(out, c, entry);
-}
-
-static int journal_entry_datetime_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- unsigned bytes = vstruct_bytes(entry);
- unsigned expected = 16;
- int ret = 0;
-
- if (journal_entry_err_on(vstruct_bytes(entry) < expected,
- c, version, jset, entry,
- journal_entry_dev_usage_bad_size,
- "bad size (%u < %u)",
- bytes, expected)) {
- journal_entry_null_range(entry, vstruct_next(entry));
- return ret;
- }
-fsck_err:
- return ret;
-}
-
-static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- struct jset_entry_datetime *datetime =
- container_of(entry, struct jset_entry_datetime, entry);
-
- bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
-}
-
-struct jset_entry_ops {
- int (*validate)(struct bch_fs *, struct jset *,
- struct jset_entry *, unsigned, int,
- struct bkey_validate_context);
- void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
-};
-
-static const struct jset_entry_ops bch2_jset_entry_ops[] = {
-#define x(f, nr) \
- [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
- .validate = journal_entry_##f##_validate, \
- .to_text = journal_entry_##f##_to_text, \
- },
- BCH_JSET_ENTRY_TYPES()
-#undef x
-};
-
-int bch2_journal_entry_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian,
- struct bkey_validate_context from)
-{
- return entry->type < BCH_JSET_ENTRY_NR
- ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
- version, big_endian, from)
- : 0;
-}
-
-void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
- struct jset_entry *entry)
-{
- bch2_prt_jset_entry_type(out, entry->type);
-
- if (entry->type < BCH_JSET_ENTRY_NR) {
- prt_str(out, ": ");
- bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
- }
-}
-
-static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
- enum bch_validate_flags flags)
-{
- struct bkey_validate_context from = {
- .flags = flags,
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(jset->seq),
- };
-
- unsigned version = le32_to_cpu(jset->version);
- int ret = 0;
-
- vstruct_for_each(jset, entry) {
- from.journal_offset = (u64 *) entry - jset->_data;
-
- if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
- c, version, jset, entry,
- journal_entry_past_jset_end,
- "journal entry extends past end of jset")) {
- jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
- break;
- }
-
- ret = bch2_journal_entry_validate(c, jset, entry, version,
- JSET_BIG_ENDIAN(jset), from);
- if (ret)
- break;
- }
-fsck_err:
- return ret;
-}
-
-static int jset_validate(struct bch_fs *c,
- struct bch_dev *ca,
- struct jset *jset, u64 sector,
- enum bch_validate_flags flags)
-{
- struct bkey_validate_context from = {
- .flags = flags,
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(jset->seq),
- };
- int ret = 0;
-
- if (le64_to_cpu(jset->magic) != jset_magic(c))
- return JOURNAL_ENTRY_NONE;
-
- unsigned version = le32_to_cpu(jset->version);
- if (journal_entry_err_on(!bch2_version_compatible(version),
- c, version, jset, NULL,
- jset_unsupported_version,
- "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq),
- BCH_VERSION_MAJOR(version),
- BCH_VERSION_MINOR(version))) {
- /* don't try to continue: */
- return -EINVAL;
- }
-
- if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
- c, version, jset, NULL,
- jset_unknown_csum,
- "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq),
- JSET_CSUM_TYPE(jset)))
- ret = JOURNAL_ENTRY_BAD;
-
- /* last_seq is ignored when JSET_NO_FLUSH is true */
- if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
- le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
- c, version, jset, NULL,
- jset_last_seq_newer_than_seq,
- "invalid journal entry: last_seq > seq (%llu > %llu)",
- le64_to_cpu(jset->last_seq),
- le64_to_cpu(jset->seq))) {
- jset->last_seq = jset->seq;
- return JOURNAL_ENTRY_BAD;
- }
-
- ret = jset_validate_entries(c, jset, flags);
-fsck_err:
- return ret;
-}
-
-static int jset_validate_early(struct bch_fs *c,
- struct bch_dev *ca,
- struct jset *jset, u64 sector,
- unsigned bucket_sectors_left,
- unsigned sectors_read)
-{
- struct bkey_validate_context from = {
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(jset->seq),
- };
- int ret = 0;
-
- if (le64_to_cpu(jset->magic) != jset_magic(c))
- return JOURNAL_ENTRY_NONE;
-
- unsigned version = le32_to_cpu(jset->version);
- if (journal_entry_err_on(!bch2_version_compatible(version),
- c, version, jset, NULL,
- jset_unsupported_version,
- "%s sector %llu seq %llu: unknown journal entry version %u.%u",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq),
- BCH_VERSION_MAJOR(version),
- BCH_VERSION_MINOR(version))) {
- /* don't try to continue: */
- return -EINVAL;
- }
-
- size_t bytes = vstruct_bytes(jset);
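-	/*
-	 * Entry extends past what the caller has read so far: ask it to
-	 * re-read (growing its buffer if needed):
-	 */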
- if (bytes > (sectors_read << 9) &&
- sectors_read < bucket_sectors_left)
- return JOURNAL_ENTRY_REREAD;
-
- if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
- c, version, jset, NULL,
- jset_past_bucket_end,
- "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq), bytes))
- le32_add_cpu(&jset->u64s,
- -((bytes - (bucket_sectors_left << 9)) / 8));
-fsck_err:
- return ret;
-}
-
-struct journal_read_buf {
- void *data;
- size_t size;
-};
-
-static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b,
- size_t new_size)
-{
- void *n;
-
- /* the bios are sized for this many pages, max: */
- if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
-
- new_size = roundup_pow_of_two(new_size);
- n = kvmalloc(new_size, GFP_KERNEL);
- if (!n)
- return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
-
- kvfree(b->data);
- b->data = n;
- b->size = new_size;
- return 0;
-}
-
-static int journal_read_bucket(struct bch_dev *ca,
- struct journal_read_buf *buf,
- struct journal_list *jlist,
- unsigned bucket)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- struct jset *j = NULL;
- unsigned sectors, sectors_read = 0;
- u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
- end = offset + ca->mi.bucket_size;
- bool saw_bad = false, csum_good;
- int ret = 0;
-
- pr_debug("reading %u", bucket);
-
- while (offset < end) {
- if (!sectors_read) {
- struct bio *bio;
- unsigned nr_bvecs;
-reread:
- sectors_read = min_t(unsigned,
- end - offset, buf->size >> 9);
- nr_bvecs = buf_pages(buf->data, sectors_read << 9);
-
- bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
- if (!bio)
- return bch_err_throw(c, ENOMEM_journal_read_bucket);
- bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
-
- bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, buf->data, sectors_read << 9);
-
- u64 submit_time = local_clock();
- ret = submit_bio_wait(bio);
- kfree(bio);
-
- if (!ret && bch2_meta_read_fault("journal"))
- ret = bch_err_throw(c, EIO_fault_injected);
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
- submit_time, !ret);
-
- if (ret) {
- bch_err_dev_ratelimited(ca,
- "journal read error: sector %llu", offset);
- /*
- * We don't error out of the recovery process
- * here, since the relevant journal entry may be
- * found on a different device, and missing or
- * no journal entries will be handled later
- */
- return 0;
- }
-
- j = buf->data;
- }
-
- ret = jset_validate_early(c, ca, j, offset,
- end - offset, sectors_read);
- switch (ret) {
- case 0:
- sectors = vstruct_sectors(j, c->block_bits);
- break;
- case JOURNAL_ENTRY_REREAD:
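-			/* entry extends past what we read: grow the buffer if needed and re-read */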
- if (vstruct_bytes(j) > buf->size) {
- ret = journal_read_buf_realloc(c, buf,
- vstruct_bytes(j));
- if (ret)
- return ret;
- }
- goto reread;
- case JOURNAL_ENTRY_NONE:
- if (!saw_bad)
- return 0;
- /*
- * On checksum error we don't really trust the size
- * field of the journal entry we read, so try reading
- * again at next block boundary:
- */
- sectors = block_sectors(c);
- goto next_block;
- default:
- return ret;
- }
-
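-		/*
-		 * Track the most recent journal entry seen on this device; new
-		 * journal writes will continue from that bucket:
-		 */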
- if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
- ja->highest_seq_found = le64_to_cpu(j->seq);
- ja->cur_idx = bucket;
- ja->sectors_free = ca->mi.bucket_size -
- bucket_remainder(ca, offset) - sectors;
- }
-
- /*
- * This happens sometimes if we don't have discards on -
- * when we've partially overwritten a bucket with new
- * journal entries. We don't need the rest of the
- * bucket:
- */
- if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- return 0;
-
- ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
- struct bch_csum csum;
- csum_good = jset_csum_good(c, j, &csum);
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
-
- if (!csum_good) {
- /*
- * Don't print an error here, we'll print the error
- * later if we need this journal entry
- */
- saw_bad = true;
- }
-
- ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
- j->encrypted_start,
- vstruct_end(j) - (void *) j->encrypted_start);
- bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
-
- mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, (struct journal_ptr) {
- .csum_good = csum_good,
- .csum = csum,
- .dev = ca->dev_idx,
- .bucket = bucket,
- .bucket_offset = offset -
- bucket_to_sector(ca, ja->buckets[bucket]),
- .sector = offset,
- }, jlist, j);
- mutex_unlock(&jlist->lock);
-
- switch (ret) {
- case JOURNAL_ENTRY_ADD_OK:
- break;
- case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
- break;
- default:
- return ret;
- }
-next_block:
- pr_debug("next");
- offset += sectors;
- sectors_read -= sectors;
- j = ((void *) j) + (sectors << 9);
- }
-
- return 0;
-}
-
-static CLOSURE_CALLBACK(bch2_journal_read_device)
-{
- closure_type(ja, struct journal_device, read);
- struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
- struct bch_fs *c = ca->fs;
- struct journal_list *jlist =
- container_of(cl->parent, struct journal_list, cl);
- struct journal_read_buf buf = { NULL, 0 };
- unsigned i;
- int ret = 0;
-
- if (!ja->nr)
- goto out;
-
- ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE);
- if (ret)
- goto err;
-
- pr_debug("%u journal buckets", ja->nr);
-
- for (i = 0; i < ja->nr; i++) {
- ret = journal_read_bucket(ca, &buf, jlist, i);
- if (ret)
- goto err;
- }
-
- /*
- * Set dirty_idx to indicate the entire journal is full and needs to be
- * reclaimed - journal reclaim will immediately reclaim whatever isn't
- * pinned when it first runs:
- */
- ja->discard_idx = ja->dirty_idx_ondisk =
- ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
-out:
- bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
- kvfree(buf.data);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read);
- closure_return(cl);
- return;
-err:
- mutex_lock(&jlist->lock);
- jlist->ret = ret;
- mutex_unlock(&jlist->lock);
- goto out;
-}
-
-noinline_for_stack
-static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
-{
- struct printbuf buf = PRINTBUF;
- enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
- bool have_good = false;
-
- prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
- bch2_journal_datetime_to_text(&buf, &j->j);
- prt_newline(&buf);
-
- darray_for_each(j->ptrs, ptr)
- if (!ptr->csum_good) {
- bch2_journal_ptr_to_text(&buf, c, ptr);
- prt_char(&buf, ' ');
- bch2_csum_to_text(&buf, csum_type, ptr->csum);
- prt_newline(&buf);
- } else {
- have_good = true;
- }
-
- prt_printf(&buf, "should be ");
- bch2_csum_to_text(&buf, csum_type, j->j.csum);
-
- if (have_good)
- prt_printf(&buf, "\n(had good copy on another device)");
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-}
-
-noinline_for_stack
-static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
-{
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- struct genradix_iter radix_iter;
- struct journal_replay *i, **_i, *prev = NULL;
- u64 seq = start_seq;
-
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- BUG_ON(seq > le64_to_cpu(i->j.seq));
-
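-		/*
-		 * Any gap in the sequence numbers that isn't covered by a
-		 * blacklist range is a missing journal entry:
-		 */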
- while (seq < le64_to_cpu(i->j.seq)) {
- while (seq < le64_to_cpu(i->j.seq) &&
- bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (seq == le64_to_cpu(i->j.seq))
- break;
-
- u64 missing_start = seq;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- !bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- u64 missing_end = seq - 1;
-
- printbuf_reset(&buf);
- prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- missing_start, missing_end,
- start_seq, end_seq);
-
- prt_printf(&buf, "\nprev at ");
- if (prev) {
- bch2_journal_ptrs_to_text(&buf, c, prev);
- prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
- } else
- prt_printf(&buf, "(none)");
-
- prt_printf(&buf, "\nnext at ");
- bch2_journal_ptrs_to_text(&buf, c, i);
- prt_printf(&buf, ", continue?");
-
- fsck_err(c, journal_entries_missing, "%s", buf.buf);
- }
-
- prev = i;
- seq++;
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_journal_read(struct bch_fs *c,
- u64 *last_seq,
- u64 *blacklist_seq,
- u64 *start_seq)
-{
- struct journal_list jlist;
- struct journal_replay *i, **_i;
- struct genradix_iter radix_iter;
- struct printbuf buf = PRINTBUF;
- bool degraded = false, last_write_torn = false;
- u64 seq;
- int ret = 0;
-
- closure_init_stack(&jlist.cl);
- mutex_init(&jlist.lock);
- jlist.last_seq = 0;
- jlist.ret = 0;
-
- for_each_member_device(c, ca) {
- if (!c->opts.fsck &&
- !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
- continue;
-
- if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
- ca->mi.state == BCH_MEMBER_STATE_ro) &&
- enumerated_ref_tryget(&ca->io_ref[READ],
- BCH_DEV_READ_REF_journal_read))
- closure_call(&ca->journal.read,
- bch2_journal_read_device,
- system_unbound_wq,
- &jlist.cl);
- else
- degraded = true;
- }
-
- while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2))
- ;
-
- if (jlist.ret)
- return jlist.ret;
-
- *last_seq = 0;
- *start_seq = 0;
- *blacklist_seq = 0;
-
- /*
-	 * Find most recent flush entry, and ignore newer non-flush entries -
- * those entries will be blacklisted:
- */
- genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- if (!*start_seq)
- *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
-
- if (JSET_NO_FLUSH(&i->j)) {
- i->ignore_blacklisted = true;
- continue;
- }
-
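-		/*
-		 * The newest flush entry may be a torn write (bad checksum) -
-		 * skip at most one such entry and keep looking for an older
-		 * flush entry:
-		 */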
- if (!last_write_torn && !i->csum_good) {
- last_write_torn = true;
- i->ignore_blacklisted = true;
- continue;
- }
-
- struct bkey_validate_context from = {
- .from = BKEY_VALIDATE_journal,
- .journal_seq = le64_to_cpu(i->j.seq),
- };
- if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
- c, le32_to_cpu(i->j.version), &i->j, NULL,
- jset_last_seq_newer_than_seq,
- "invalid journal entry: last_seq > seq (%llu > %llu)",
- le64_to_cpu(i->j.last_seq),
- le64_to_cpu(i->j.seq)))
- i->j.last_seq = i->j.seq;
-
- *last_seq = le64_to_cpu(i->j.last_seq);
- *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
- break;
- }
-
- if (!*start_seq) {
- bch_info(c, "journal read done, but no entries found");
- return 0;
- }
-
- if (!*last_seq) {
- fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
- "journal read done, but no entries found after dropping non-flushes");
- return 0;
- }
-
- printbuf_reset(&buf);
- prt_printf(&buf, "journal read done, replaying entries %llu-%llu",
- *last_seq, *blacklist_seq - 1);
-
- /*
- * Drop blacklisted entries and entries older than last_seq (or start of
-	 * Drop blacklisted entries and entries older than last_seq (or the
-	 * journal rewind point, if that's older):
- u64 drop_before = *last_seq;
- if (c->opts.journal_rewind) {
- drop_before = min(drop_before, c->opts.journal_rewind);
- prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind);
- }
-
- *last_seq = drop_before;
- if (*start_seq != *blacklist_seq)
- prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1);
- bch_info(c, "%s", buf.buf);
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- seq = le64_to_cpu(i->j.seq);
- if (seq < drop_before) {
- journal_replay_free(c, i, false);
- continue;
- }
-
- if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
- fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
- jset_seq_blacklisted,
- "found blacklisted journal entry %llu", seq);
- i->ignore_blacklisted = true;
- }
- }
-
- ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1);
- if (ret)
- goto err;
-
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- union bch_replicas_padded replicas = {
- .e.data_type = BCH_DATA_journal,
- .e.nr_devs = 0,
- .e.nr_required = 1,
- };
-
- i = *_i;
- if (journal_replay_ignore(i))
- continue;
-
- /*
- * Don't print checksum errors until we know we're going to use
- * a given journal entry:
- */
- darray_for_each(i->ptrs, ptr)
- if (!ptr->csum_good) {
- bch2_journal_print_checksum_error(c, i);
- break;
- }
-
- ret = jset_validate(c,
- bch2_dev_have_ref(c, i->ptrs.data[0].dev),
- &i->j,
- i->ptrs.data[0].sector,
- READ);
- if (ret)
- goto err;
-
- darray_for_each(i->ptrs, ptr)
- replicas_entry_add_dev(&replicas.e, ptr->dev);
-
- bch2_replicas_entry_sort(&replicas.e);
-
- printbuf_reset(&buf);
- bch2_replicas_entry_to_text(&buf, &replicas.e);
-
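-		/*
-		 * Make sure the superblock's replicas section covers the
-		 * devices this entry was found on:
-		 */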
- if (!degraded &&
- !bch2_replicas_marked(c, &replicas.e) &&
- (le64_to_cpu(i->j.seq) == *last_seq ||
- fsck_err(c, journal_entry_replicas_not_marked,
- "superblock not marked as containing replicas for journal entry %llu\n%s",
- le64_to_cpu(i->j.seq), buf.buf))) {
- ret = bch2_mark_replicas(c, &replicas.e);
- if (ret)
- goto err;
- }
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/* journal write: */
-
-static void journal_advance_devs_to_next_bucket(struct journal *j,
- struct dev_alloc_list *devs,
- unsigned sectors, __le64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- guard(rcu)();
- darray_for_each(*devs, i) {
- struct bch_dev *ca = rcu_dereference(c->devs[*i]);
- if (!ca)
- continue;
-
- struct journal_device *ja = &ca->journal;
-
- if (sectors > ja->sectors_free &&
- sectors <= ca->mi.bucket_size &&
- bch2_journal_dev_buckets_available(j, ja,
- journal_space_discarded)) {
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->sectors_free = ca->mi.bucket_size;
-
- /*
- * ja->bucket_seq[ja->cur_idx] must always have
- * something sensible:
- */
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
- }
- }
-}
-
-static void __journal_write_alloc(struct journal *j,
- struct journal_buf *w,
- struct dev_alloc_list *devs,
- unsigned sectors,
- unsigned *replicas,
- unsigned replicas_want)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- darray_for_each(*devs, i) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE,
- BCH_DEV_WRITE_REF_journal_write);
- if (!ca)
- continue;
-
- struct journal_device *ja = &ca->journal;
-
- /*
- * Check that we can use this device, and aren't already using
- * it:
- */
- if (!ca->mi.durability ||
- ca->mi.state != BCH_MEMBER_STATE_rw ||
- !ja->nr ||
- bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
- sectors > ja->sectors_free) {
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
- continue;
- }
-
- bch2_dev_stripe_increment(ca, &j->wp.stripe);
-
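-		/*
-		 * The write goes at the current fill point of this device's
-		 * current journal bucket:
-		 */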
- bch2_bkey_append_ptr(&w->key,
- (struct bch_extent_ptr) {
- .offset = bucket_to_sector(ca,
- ja->buckets[ja->cur_idx]) +
- ca->mi.bucket_size -
- ja->sectors_free,
- .dev = ca->dev_idx,
- });
-
- ja->sectors_free -= sectors;
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
-
- *replicas += ca->mi.durability;
-
- if (*replicas >= replicas_want)
- break;
- }
-}
-
-static int journal_write_alloc(struct journal *j, struct journal_buf *w,
- unsigned *replicas)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_devs_mask devs;
- struct dev_alloc_list devs_sorted;
- unsigned sectors = vstruct_sectors(w->data, c->block_bits);
- unsigned target = c->opts.metadata_target ?:
- c->opts.foreground_target;
- unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas);
- unsigned replicas_need = min_t(unsigned, replicas_want,
- READ_ONCE(c->opts.metadata_replicas_required));
- bool advance_done = false;
-
-retry_target:
- devs = target_rw_devs(c, BCH_DATA_journal, target);
- bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted);
-retry_alloc:
- __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
-
- if (likely(*replicas >= replicas_want))
- goto done;
-
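-	/*
-	 * Didn't get enough replicas from the current buckets: advance devices
-	 * to their next journal bucket and retry the allocation once:
-	 */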
- if (!advance_done) {
- journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq);
- advance_done = true;
- goto retry_alloc;
- }
-
- if (*replicas < replicas_want && target) {
- /* Retry from all devices: */
- target = 0;
- advance_done = false;
- goto retry_target;
- }
-done:
- BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
-
-#if 0
- /*
- * XXX: we need a way to alert the user when we go degraded for any
- * reason
- */
- if (*replicas < min(replicas_want,
- dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) {
- }
-#endif
-
- return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
-}
-
-static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- /* we aren't holding j->lock: */
- unsigned new_size = READ_ONCE(j->buf_size_want);
- void *new_buf;
-
- if (buf->buf_size >= new_size)
- return;
-
- size_t btree_write_buffer_size = new_size / 64;
-
- if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
- return;
-
- new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
- if (!new_buf)
- return;
-
- memcpy(new_buf, buf->data, buf->buf_size);
-
- spin_lock(&j->lock);
- swap(buf->data, new_buf);
- swap(buf->buf_size, new_size);
- spin_unlock(&j->lock);
-
- kvfree(new_buf);
-}
-
-static CLOSURE_CALLBACK(journal_write_done)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union bch_replicas_padded replicas;
- u64 seq = le64_to_cpu(w->data->seq);
- int err = 0;
-
- bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
- ? j->flush_write_time
- : j->noflush_write_time, j->write_start_time);
-
- if (!w->devs_written.nr) {
- err = bch_err_throw(c, journal_write_err);
- } else {
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
- w->devs_written);
- err = bch2_mark_replicas(c, &replicas.e);
- }
-
- if (err && !bch2_journal_error(j)) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- if (err == -BCH_ERR_journal_write_err)
- prt_printf(&buf, "unable to write journal to sufficient devices\n");
- else
- prt_printf(&buf, "journal write error marking replicas: %s\n",
- bch2_err_str(err));
-
- bch2_fs_emergency_read_only2(c, &buf);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-
- closure_debug_destroy(cl);
-
- spin_lock(&j->lock);
- if (seq >= j->pin.front)
- journal_seq_pin(j, seq)->devs = w->devs_written;
- if (err && (!j->err_seq || seq < j->err_seq))
- j->err_seq = seq;
- w->write_done = true;
-
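-	/*
-	 * Stash this write's buffer as j->free_buf if it's bigger than the one
-	 * we have, so it can be reused:
-	 */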
- if (!j->free_buf || j->free_buf_size < w->buf_size) {
- swap(j->free_buf, w->data);
- swap(j->free_buf_size, w->buf_size);
- }
-
- if (w->data) {
- void *buf = w->data;
- w->data = NULL;
- w->buf_size = 0;
-
- spin_unlock(&j->lock);
- kvfree(buf);
- spin_lock(&j->lock);
- }
-
- bool completed = false;
- bool do_discards = false;
-
- for (seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++) {
- w = j->buf + (seq & JOURNAL_BUF_MASK);
- if (!w->write_done)
- break;
-
- if (!j->err_seq && !w->noflush) {
- j->flushed_seq_ondisk = seq;
- j->last_seq_ondisk = w->last_seq;
-
- closure_wake_up(&c->freelist_wait);
- bch2_reset_alloc_cursors(c);
- do_discards = true;
- }
-
- j->seq_ondisk = seq;
-
- /*
- * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
- * more buckets:
- *
- * Must come before signaling write completion, for
- * bch2_fs_journal_stop():
- */
- if (j->watermark != BCH_WATERMARK_stripe)
- journal_reclaim_kick(&c->journal);
-
- closure_wake_up(&w->wait);
- completed = true;
- }
-
- if (completed) {
- bch2_journal_reclaim_fast(j);
- bch2_journal_space_available(j);
-
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
-
- journal_wake(j);
- }
-
- if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
- j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
- struct journal_buf *buf = journal_cur_buf(j);
- long delta = buf->expires - jiffies;
-
- /*
-		 * We don't close a journal entry to write it while there are
- * previous entries still in flight - the current journal entry
- * might want to be written now:
- */
- mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
- }
-
- /*
-	 * We don't typically trigger journal writes from here - the next journal
- * write will be triggered immediately after the previous one is
- * allocated, in bch2_journal_write() - but the journal write error path
- * is special:
- */
- bch2_journal_do_writes(j);
- spin_unlock(&j->lock);
-
- if (do_discards)
- bch2_do_discards(c);
-}
-
-static void journal_write_endio(struct bio *bio)
-{
- struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
- struct bch_dev *ca = jbio->ca;
- struct journal *j = &ca->fs->journal;
- struct journal_buf *w = j->buf + jbio->buf_idx;
-
- bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
- jbio->submit_time, !bio->bi_status);
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca,
- "error writing journal entry %llu: %s",
- le64_to_cpu(w->data->seq),
- bch2_blk_status_to_str(bio->bi_status));
-
- unsigned long flags;
- spin_lock_irqsave(&j->err_lock, flags);
- bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
- spin_unlock_irqrestore(&j->err_lock, flags);
- }
-
- closure_put(&w->io);
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
-}
-
-static CLOSURE_CALLBACK(journal_write_submit)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned sectors = vstruct_sectors(w->data, c->block_bits);
-
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
- sectors);
-
- struct journal_device *ja = &ca->journal;
- struct journal_bio *jbio = ja->bio[w->idx];
- struct bio *bio = &jbio->bio;
-
- jbio->submit_time = local_clock();
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);
-
- BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
- ca->prev_journal_sector = bio->bi_iter.bi_sector;
-
- if (!JSET_NO_FLUSH(w->data))
- bio->bi_opf |= REQ_FUA;
- if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
- bio->bi_opf |= REQ_PREFLUSH;
-
- bch2_bio_map(bio, w->data, sectors << 9);
-
- trace_and_count(c, journal_write, bio);
- closure_bio_submit(bio, cl);
-
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
- }
-
- continue_at(cl, journal_write_done, j->wq);
-}
-
-static CLOSURE_CALLBACK(journal_write_preflush)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- /*
-	 * Wait for previous journal writes to complete; they won't necessarily
- * be flushed if they're still in flight
- */
- if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
- spin_lock(&j->lock);
- if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
- closure_wait(&j->async_wait, cl);
- spin_unlock(&j->lock);
- continue_at(cl, journal_write_preflush, j->wq);
- return;
- }
- spin_unlock(&j->lock);
- }
-
- if (w->separate_flush) {
- for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) {
- enumerated_ref_get(&ca->io_ref[WRITE],
- BCH_DEV_WRITE_REF_journal_write);
-
- struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->bio[w->idx]->bio;
- bio_reset(bio, ca->disk_sb.bdev,
- REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
-
- continue_at(cl, journal_write_submit, j->wq);
- } else {
- /*
- * no need to punt to another work item if we're not waiting on
- * preflushes
- */
- journal_write_submit(&cl->work);
- }
-}
-
-static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *start, *end;
- struct jset *jset = w->data;
- struct journal_keys_to_wb wb = { NULL };
- unsigned u64s;
- unsigned long btree_roots_have = 0;
- u64 seq = le64_to_cpu(jset->seq);
- int ret;
-
- /*
- * Simple compaction, dropping empty jset_entries (from journal
- * reservations that weren't fully used) and merging jset_entries that
- * can be.
- *
- * If we wanted to be really fancy here, we could sort all the keys in
- * the jset and drop keys that were overwritten - probably not worth it:
- */
- vstruct_for_each(jset, i) {
- unsigned u64s = le16_to_cpu(i->u64s);
-
- /* Empty entry: */
- if (!u64s)
- continue;
-
- /*
- * New btree roots are set by journalling them; when the journal
- * entry gets written we have to propagate them to
- * c->btree_roots
- *
- * But, every journal entry we write has to contain all the
- * btree roots (at least for now); so after we copy btree roots
- * to c->btree_roots we have to get any missing btree roots and
- * add them to this journal entry:
- */
- switch (i->type) {
- case BCH_JSET_ENTRY_btree_root:
- bch2_journal_entry_to_btree_root(c, i);
- __set_bit(i->btree_id, &btree_roots_have);
- break;
- case BCH_JSET_ENTRY_write_buffer_keys:
- EBUG_ON(!w->need_flush_to_write_buffer);
-
- if (!wb.wb)
- bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
-
- jset_entry_for_each_key(i, k) {
- ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
- if (ret) {
- bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
- bch2_err_str(ret));
- bch2_journal_keys_to_write_buffer_end(c, &wb);
- return ret;
- }
- }
- i->type = BCH_JSET_ENTRY_btree_keys;
- break;
- }
- }
-
- if (wb.wb) {
- ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
- if (ret) {
- bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
- bch2_err_str(ret));
- return ret;
- }
- }
-
- spin_lock(&c->journal.lock);
- w->need_flush_to_write_buffer = false;
- spin_unlock(&c->journal.lock);
-
- start = end = vstruct_last(jset);
-
- end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
-
- struct jset_entry_datetime *d =
- container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
- d->entry.type = BCH_JSET_ENTRY_datetime;
- d->seconds = cpu_to_le64(ktime_get_real_seconds());
-
- bch2_journal_super_entries_add_common(c, &end, seq);
- u64s = (u64 *) end - (u64 *) start;
-
- WARN_ON(u64s > j->entry_u64s_reserved);
-
- le32_add_cpu(&jset->u64s, u64s);
-
- unsigned sectors = vstruct_sectors(jset, c->block_bits);
-
- if (sectors > w->sectors) {
- bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
- vstruct_bytes(jset), w->sectors << 9,
- u64s, w->u64s_reserved, j->entry_u64s_reserved);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset *jset = w->data;
- u64 seq = le64_to_cpu(jset->seq);
- bool validate_before_checksum = false;
- int ret = 0;
-
- jset->magic = cpu_to_le64(jset_magic(c));
- jset->version = cpu_to_le32(c->sb.version);
-
- SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
- SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
-
- if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
- j->last_empty_seq = seq;
-
- if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
- validate_before_checksum = true;
-
- if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
- validate_before_checksum = true;
-
- if (validate_before_checksum &&
- (ret = jset_validate(c, NULL, jset, 0, WRITE)))
- return ret;
-
- ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
- jset->encrypted_start,
- vstruct_end(jset) - (void *) jset->encrypted_start);
- if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
- return ret;
-
- jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
- journal_nonce(jset), jset);
-
- if (!validate_before_checksum &&
- (ret = jset_validate(c, NULL, jset, 0, WRITE)))
- return ret;
-
- unsigned sectors = vstruct_sectors(jset, c->block_bits);
- unsigned bytes = vstruct_bytes(jset);
- memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
- return 0;
-}
-
-static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- int error = bch2_journal_error(j);
-
- /*
- * If the journal is in an error state - we did an emergency shutdown -
- * we prefer to continue doing journal writes. We just mark them as
-	 * noflush so they'll never be used, but they'll still be visible to the
- * list_journal tool - this helps in debugging.
- *
- * There's a caveat: the first journal write after marking the
- * superblock dirty must always be a flush write, because on startup
- * from a clean shutdown we didn't necessarily read the journal and the
- * new journal write might overwrite whatever was in the journal
- * previously - we can't leave the journal without any flush writes in
- * it.
- *
- * So if we're in an error state, and we're still starting up, we don't
- * write anything at all.
- */
- if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
- return error;
-
- if (error ||
- w->noflush ||
- (!w->must_flush &&
- time_before(jiffies, j->last_flush_write +
- msecs_to_jiffies(c->opts.journal_flush_delay)) &&
- test_bit(JOURNAL_may_skip_flush, &j->flags))) {
- w->noflush = true;
- SET_JSET_NO_FLUSH(w->data, true);
- w->data->last_seq = 0;
- w->last_seq = 0;
-
- j->nr_noflush_writes++;
- } else {
- w->must_flush = true;
- j->last_flush_write = jiffies;
- j->nr_flush_writes++;
- clear_bit(JOURNAL_need_flush_write, &j->flags);
- }
-
- return 0;
-}
-
-CLOSURE_CALLBACK(bch2_journal_write)
-{
- closure_type(w, struct journal_buf, io);
- struct journal *j = container_of(w, struct journal, buf[w->idx]);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union bch_replicas_padded replicas;
- unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
- int ret;
-
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
- BUG_ON(!w->write_started);
- BUG_ON(w->write_allocated);
- BUG_ON(w->write_done);
-
- j->write_start_time = local_clock();
-
- spin_lock(&j->lock);
- if (nr_rw_members > 1)
- w->separate_flush = true;
-
- ret = bch2_journal_write_pick_flush(j, w);
- spin_unlock(&j->lock);
-
- if (unlikely(ret))
- goto err;
-
- mutex_lock(&j->buf_lock);
- journal_buf_realloc(j, w);
-
- ret = bch2_journal_write_prep(j, w);
- mutex_unlock(&j->buf_lock);
-
- if (unlikely(ret))
- goto err;
-
- unsigned replicas_allocated = 0;
- while (1) {
- ret = journal_write_alloc(j, w, &replicas_allocated);
- if (!ret || !j->can_discard)
- break;
-
- bch2_journal_do_discards(j);
- }
-
- if (unlikely(ret))
- goto err_allocate_write;
-
- ret = bch2_journal_write_checksum(j, w);
- if (unlikely(ret))
- goto err;
-
- spin_lock(&j->lock);
- /*
- * write is allocated, no longer need to account for it in
- * bch2_journal_space_available():
- */
- w->sectors = 0;
- w->write_allocated = true;
- j->entry_bytes_written += vstruct_bytes(w->data);
-
- /*
- * journal entry has been compacted and allocated, recalculate space
- * available:
- */
- bch2_journal_space_available(j);
- bch2_journal_do_writes(j);
- spin_unlock(&j->lock);
-
- w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
-
- /*
- * Mark journal replicas before we submit the write to guarantee
- * recovery will find the journal entries after a crash.
- */
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
- w->devs_written);
- ret = bch2_mark_replicas(c, &replicas.e);
- if (ret)
- goto err;
-
- if (c->opts.nochanges)
- goto no_io;
-
- if (!JSET_NO_FLUSH(w->data))
- continue_at(cl, journal_write_preflush, j->wq);
- else
- continue_at(cl, journal_write_submit, j->wq);
- return;
-err_allocate_write:
- if (!bch2_journal_error(j)) {
- struct printbuf buf = PRINTBUF;
-
- bch2_journal_debug_to_text(&buf, j);
- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
- le64_to_cpu(w->data->seq),
- vstruct_sectors(w->data, c->block_bits),
- bch2_err_str(ret));
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-err:
- bch2_fatal_error(c);
-no_io:
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
- }
-
- continue_at(cl, journal_write_done, j->wq);
-}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
deleted file mode 100644
index 6fa82c4050fe..000000000000
--- a/fs/bcachefs/journal_io.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_IO_H
-#define _BCACHEFS_JOURNAL_IO_H
-
-#include "darray.h"
-
-void bch2_journal_pos_from_member_info_set(struct bch_fs *);
-void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
-
-struct journal_ptr {
- bool csum_good;
- struct bch_csum csum;
- u8 dev;
- u32 bucket;
- u32 bucket_offset;
- u64 sector;
-};
-
-/*
- * Only used for holding the journal entries we read in btree_journal_read()
- * during cache_registration
- */
-struct journal_replay {
- DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
-
- bool csum_good;
- bool ignore_blacklisted;
- bool ignore_not_dirty;
- /* must be last: */
- struct jset j;
-};
-
-static inline bool journal_replay_ignore(struct journal_replay *i)
-{
- return !i || i->ignore_blacklisted || i->ignore_not_dirty;
-}
-
-static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
- struct jset_entry *entry, unsigned type)
-{
- while (entry < vstruct_last(jset)) {
- if (entry->type == type)
- return entry;
-
- entry = vstruct_next(entry);
- }
-
- return NULL;
-}
-
-#define for_each_jset_entry_type(entry, jset, type) \
- for (struct jset_entry *entry = (jset)->start; \
- (entry = __jset_entry_type_next(jset, entry, type)); \
- entry = vstruct_next(entry))
-
-#define jset_entry_for_each_key(_e, _k) \
- for (struct bkey_i *_k = (_e)->start; \
- _k < vstruct_last(_e); \
- _k = bkey_next(_k))
-
-#define for_each_jset_key(k, entry, jset) \
- for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\
- jset_entry_for_each_key(entry, k)
-
-int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
- struct jset_entry *, unsigned, int,
- struct bkey_validate_context);
-void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
- struct jset_entry *);
-
-void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
- struct journal_replay *);
-
-int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
-
-CLOSURE_CALLBACK(bch2_journal_write);
-
-static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
- struct jset_entry *entry = *end;
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
- memset(entry, 0, u64s * sizeof(u64));
- /*
- * The u64s field counts from the start of data, ignoring the shared
- * fields.
- */
- entry->u64s = cpu_to_le16(u64s - 1);
-
- *end = vstruct_next(*end);
- return entry;
-}
-
-#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
deleted file mode 100644
index 0042d43b8e57..000000000000
--- a/fs/bcachefs/journal_reclaim.c
+++ /dev/null
@@ -1,1037 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "trace.h"
-
-#include <linux/kthread.h>
-#include <linux/sched/mm.h>
-
-static bool __should_discard_bucket(struct journal *, struct journal_device *);
-
-/* Free space calculations: */
-
-static unsigned journal_space_from(struct journal_device *ja,
- enum journal_space_from from)
-{
- switch (from) {
- case journal_space_discarded:
- return ja->discard_idx;
- case journal_space_clean_ondisk:
- return ja->dirty_idx_ondisk;
- case journal_space_clean:
- return ja->dirty_idx;
- default:
- BUG();
- }
-}
-
-unsigned bch2_journal_dev_buckets_available(struct journal *j,
- struct journal_device *ja,
- enum journal_space_from from)
-{
- if (!ja->nr)
- return 0;
-
- unsigned available = (journal_space_from(ja, from) -
- ja->cur_idx - 1 + ja->nr) % ja->nr;
-
- /*
- * Don't use the last bucket unless writing the new last_seq
- * will make another bucket available:
- */
- if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
- --available;
-
- return available;
-}
-
-void bch2_journal_set_watermark(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool low_on_space = j->space[journal_space_clean].total * 4 <=
- j->space[journal_space_total].total;
- bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
- bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
- unsigned watermark = low_on_space || low_on_pin || low_on_wb
- ? BCH_WATERMARK_reclaim
- : BCH_WATERMARK_stripe;
-
- if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
- track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
- track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
- trace_and_count(c, journal_full, c);
-
- mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin);
-
- swap(watermark, j->watermark);
- if (watermark > j->watermark)
- journal_wake(j);
-}
-
-static struct journal_space
-journal_dev_space_available(struct journal *j, struct bch_dev *ca,
- enum journal_space_from from)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_device *ja = &ca->journal;
- unsigned sectors, buckets, unwritten;
- unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c));
- u64 seq;
-
- if (from == journal_space_total)
- return (struct journal_space) {
- .next_entry = bucket_size_aligned,
- .total = bucket_size_aligned * ja->nr,
- };
-
- buckets = bch2_journal_dev_buckets_available(j, ja, from);
- sectors = round_down(ja->sectors_free, block_sectors(c));
-
- /*
-	 * Note that we don't allocate the space for a journal entry
- * until we write it out - thus, account for it here:
- */
- for (seq = journal_last_unwritten_seq(j);
- seq <= journal_cur_seq(j);
- seq++) {
- unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
-
- if (!unwritten)
- continue;
-
- /* entry won't fit on this device, skip: */
- if (unwritten > bucket_size_aligned)
- continue;
-
- if (unwritten >= sectors) {
- if (!buckets) {
- sectors = 0;
- break;
- }
-
- buckets--;
- sectors = bucket_size_aligned;
- }
-
- sectors -= unwritten;
- }
-
- if (sectors < ca->mi.bucket_size && buckets) {
- buckets--;
- sectors = bucket_size_aligned;
- }
-
- return (struct journal_space) {
- .next_entry = sectors,
- .total = sectors + buckets * bucket_size_aligned,
- };
-}
-
-static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
- enum journal_space_from from)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned pos, nr_devs = 0;
- struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
- unsigned min_bucket_size = U32_MAX;
-
- BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
-
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- if (!ca->journal.nr ||
- !ca->mi.durability)
- continue;
-
- min_bucket_size = min(min_bucket_size, ca->mi.bucket_size);
-
- space = journal_dev_space_available(j, ca, from);
- if (!space.next_entry)
- continue;
-
- for (pos = 0; pos < nr_devs; pos++)
- if (space.total > dev_space[pos].total)
- break;
-
- array_insert_item(dev_space, nr_devs, pos, space);
- }
-
- if (nr_devs < nr_devs_want)
- return (struct journal_space) { 0, 0 };
-
- /*
- * It's possible for bucket size to be misaligned w.r.t. the filesystem
- * block size:
- */
- min_bucket_size = round_down(min_bucket_size, block_sectors(c));
-
- /*
- * We sorted largest to smallest, and we want the smallest out of the
- * @nr_devs_want largest devices:
- */
- space = dev_space[nr_devs_want - 1];
- space.next_entry = min(space.next_entry, min_bucket_size);
- return space;
-}
-
-void bch2_journal_space_available(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned clean, clean_ondisk, total;
- unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
- j->buf[1].buf_size >> 9);
- unsigned nr_online = 0, nr_devs_want;
- bool can_discard = false;
- int ret = 0;
-
- lockdep_assert_held(&j->lock);
- guard(rcu)();
-
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
- struct journal_device *ja = &ca->journal;
-
- if (!ja->nr)
- continue;
-
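-		/*
-		 * Advance the dirty indices past buckets whose entries are
-		 * older than last_seq (in memory and on disk respectively):
-		 */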
- while (ja->dirty_idx != ja->cur_idx &&
- ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
- ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
-
- while (ja->dirty_idx_ondisk != ja->dirty_idx &&
- ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
- ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
-
- can_discard |= __should_discard_bucket(j, ja);
-
- max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
- nr_online++;
- }
-
- j->can_discard = can_discard;
-
- if (nr_online < metadata_replicas_required(c)) {
- if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
- "rw journal devs:", nr_online, metadata_replicas_required(c));
-
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
- prt_printf(&buf, " %s", ca->name);
-
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- ret = bch_err_throw(c, insufficient_journal_devices);
- goto out;
- }
-
- nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
-
- for (unsigned i = 0; i < journal_space_nr; i++)
- j->space[i] = __journal_space_available(j, nr_devs_want, i);
-
- clean_ondisk = j->space[journal_space_clean_ondisk].total;
- clean = j->space[journal_space_clean].total;
- total = j->space[journal_space_total].total;
-
- if (!j->space[journal_space_discarded].next_entry)
- ret = bch_err_throw(c, journal_full);
-
- if ((j->space[journal_space_clean_ondisk].next_entry <
- j->space[journal_space_clean_ondisk].total) &&
- (clean - clean_ondisk <= total / 8) &&
- (clean_ondisk * 2 > clean))
- set_bit(JOURNAL_may_skip_flush, &j->flags);
- else
- clear_bit(JOURNAL_may_skip_flush, &j->flags);
-
- bch2_journal_set_watermark(j);
-out:
- j->cur_entry_sectors = !ret
- ? j->space[journal_space_discarded].next_entry
- : 0;
- j->cur_entry_error = ret;
-
- if (!ret)
- journal_wake(j);
-}
-
-/* Discards - last part of journal reclaim: */
-
-static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
- unsigned min_free = max(4, ja->nr / 8);
-
- return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) <
- min_free &&
- ja->discard_idx != ja->dirty_idx_ondisk;
-}
-
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
- spin_lock(&j->lock);
- bool ret = __should_discard_bucket(j, ja);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/*
- * Advance ja->discard_idx as long as it points to buckets that are no longer
- * dirty, issuing discards if necessary:
- */
-void bch2_journal_do_discards(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- mutex_lock(&j->discard_lock);
-
- for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) {
- struct journal_device *ja = &ca->journal;
-
- while (should_discard_bucket(j, ja)) {
- if (!c->opts.nochanges &&
- bch2_discard_opt_enabled(c, ca) &&
- bdev_max_discard_sectors(ca->disk_sb.bdev))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca,
- ja->buckets[ja->discard_idx]),
- ca->mi.bucket_size, GFP_NOFS);
-
- spin_lock(&j->lock);
- ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
-
- bch2_journal_space_available(j);
- spin_unlock(&j->lock);
- }
- }
-
- mutex_unlock(&j->discard_lock);
-}
-
-/*
- * Journal entry pinning - machinery for holding a reference on a given journal
- * entry, holding it open to ensure it gets replayed during recovery:
- */
-
-void bch2_journal_reclaim_fast(struct journal *j)
-{
- bool popped = false;
-
- lockdep_assert_held(&j->lock);
-
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!fifo_empty(&j->pin) &&
- j->pin.front <= j->seq_ondisk &&
- !atomic_read(&fifo_peek_front(&j->pin).count)) {
- j->pin.front++;
- popped = true;
- }
-
- if (popped) {
- bch2_journal_space_available(j);
- __closure_wake_up(&j->reclaim_flush_wait);
- }
-}
-
-bool __bch2_journal_pin_put(struct journal *j, u64 seq)
-{
- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
- return atomic_dec_and_test(&pin_list->count);
-}
-
-void bch2_journal_pin_put(struct journal *j, u64 seq)
-{
- if (__bch2_journal_pin_put(j, seq)) {
- spin_lock(&j->lock);
- bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
- }
-}
-
-static inline bool __journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- struct journal_entry_pin_list *pin_list;
-
- if (!journal_pin_active(pin))
- return false;
-
- if (j->flush_in_progress == pin)
- j->flush_in_progress_dropped = true;
-
- pin_list = journal_seq_pin(j, pin->seq);
- pin->seq = 0;
- list_del_init(&pin->list);
-
- if (j->reclaim_flush_wait.list.first)
- __closure_wake_up(&j->reclaim_flush_wait);
-
- /*
- * Unpinning a journal entry may make journal_next_bucket() succeed, if
- * writing a new last_seq will now make another bucket available:
- */
- return atomic_dec_and_test(&pin_list->count) &&
- pin_list == &fifo_peek_front(&j->pin);
-}
-
-void bch2_journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- spin_lock(&j->lock);
- if (__journal_pin_drop(j, pin))
- bch2_journal_reclaim_fast(j);
- spin_unlock(&j->lock);
-}
-
-static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
- journal_pin_flush_fn fn)
-{
- if (fn == bch2_btree_node_flush0 ||
- fn == bch2_btree_node_flush1) {
- unsigned idx = fn == bch2_btree_node_flush1;
- struct btree *b = container_of(pin, struct btree, writes[idx].journal);
-
- return JOURNAL_PIN_TYPE_btree0 - b->c.level;
- } else if (fn == bch2_btree_key_cache_journal_flush)
- return JOURNAL_PIN_TYPE_key_cache;
- else
- return JOURNAL_PIN_TYPE_other;
-}
-
-static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn,
- enum journal_pin_type type)
-{
- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
- /*
- * flush_fn is how we identify journal pins in debugfs, so must always
- * exist, even if it doesn't do anything:
- */
- BUG_ON(!flush_fn);
-
- atomic_inc(&pin_list->count);
- pin->seq = seq;
- pin->flush = flush_fn;
-
- if (list_empty(&pin_list->unflushed[type]) &&
- j->reclaim_flush_wait.list.first)
- __closure_wake_up(&j->reclaim_flush_wait);
-
- list_add(&pin->list, &pin_list->unflushed[type]);
-}
-
-void bch2_journal_pin_copy(struct journal *j,
- struct journal_entry_pin *dst,
- struct journal_entry_pin *src,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- u64 seq = READ_ONCE(src->seq);
-
- if (seq < journal_last_seq(j)) {
- /*
- * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
- * the src pin - with the pin dropped, the entry to pin might no
-		 * longer exist, but that means there's no longer anything to
- * copy and we can bail out here:
- */
- spin_unlock(&j->lock);
- return;
- }
-
- bool reclaim = __journal_pin_drop(j, dst);
-
- bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
-
- if (reclaim)
- bch2_journal_reclaim_fast(j);
-
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- if (seq == journal_last_seq(j))
- journal_wake(j);
- spin_unlock(&j->lock);
-}
-
-void bch2_journal_pin_set(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- BUG_ON(seq < journal_last_seq(j));
-
- bool reclaim = __journal_pin_drop(j, pin);
-
- bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
-
- if (reclaim)
- bch2_journal_reclaim_fast(j);
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- if (seq == journal_last_seq(j))
- journal_wake(j);
-
- spin_unlock(&j->lock);
-}
-
-/**
- * bch2_journal_pin_flush: ensure journal pin callback is no longer running
- * @j: journal object
- * @pin: pin to flush
- */
-void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
-{
- BUG_ON(journal_pin_active(pin));
-
- wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
-}
-
-/*
- * Journal reclaim: flush references to open journal entries to reclaim space in
- * the journal
- *
- * May be done by the journal code in the background as needed to free up space
- * for more journal entries, or as part of doing a clean shutdown, or to migrate
- * data off of a specific device:
- */
-
-static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j,
- u64 seq_to_flush,
- unsigned allowed_below_seq,
- unsigned allowed_above_seq,
- u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *ret = NULL;
-
- fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
- if (*seq > seq_to_flush && !allowed_above_seq)
- break;
-
- for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
- if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
- (BIT(i) & allowed_above_seq)) {
- ret = list_first_entry_or_null(&pin_list->unflushed[i],
- struct journal_entry_pin, list);
- if (ret)
- return ret;
- }
- }
-
- return NULL;
-}
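The walk above visits pin lists oldest-first and returns the first pin whose type is allowed: below seq_to_flush any type in allowed_below_seq qualifies, past it only types in allowed_above_seq do. A tiny standalone sketch of just that eligibility predicate (type names and values made up for the example):

#include <stdbool.h>
#include <stdio.h>

#define BIT(n)	(1U << (n))

enum { PIN_btree, PIN_key_cache, PIN_other };

static bool pin_eligible(unsigned type, unsigned long long seq,
			 unsigned long long seq_to_flush,
			 unsigned allowed_below, unsigned allowed_above)
{
	return ((BIT(type) & allowed_below) && seq <= seq_to_flush) ||
		(BIT(type) & allowed_above);
}

int main(void)
{
	/* Past seq_to_flush, only key cache pins may still be flushed: */
	printf("%d %d\n",
	       pin_eligible(PIN_key_cache, 110, 100, ~0U, BIT(PIN_key_cache)),
	       pin_eligible(PIN_btree,     110, 100, ~0U, BIT(PIN_key_cache)));
	return 0;
}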
-
-/* returns true if we did work */
-static size_t journal_flush_pins(struct journal *j,
- u64 seq_to_flush,
- unsigned allowed_below_seq,
- unsigned allowed_above_seq,
- unsigned min_any,
- unsigned min_key_cache)
-{
- struct journal_entry_pin *pin;
- size_t nr_flushed = 0;
- journal_pin_flush_fn flush_fn;
- u64 seq;
- int err;
-
- lockdep_assert_held(&j->reclaim_lock);
-
- while (1) {
- unsigned allowed_above = allowed_above_seq;
- unsigned allowed_below = allowed_below_seq;
-
- if (min_any) {
- allowed_above |= ~0;
- allowed_below |= ~0;
- }
-
- if (min_key_cache) {
- allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
- allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
- }
-
- cond_resched();
-
- j->last_flushed = jiffies;
-
- spin_lock(&j->lock);
- pin = journal_get_next_pin(j, seq_to_flush,
- allowed_below,
- allowed_above, &seq);
- if (pin) {
- BUG_ON(j->flush_in_progress);
- j->flush_in_progress = pin;
- j->flush_in_progress_dropped = false;
- flush_fn = pin->flush;
- }
- spin_unlock(&j->lock);
-
- if (!pin)
- break;
-
- if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
- min_key_cache--;
-
- if (min_any)
- min_any--;
-
- err = flush_fn(j, pin, seq);
-
- spin_lock(&j->lock);
- /* Pin might have been dropped or rearmed: */
- if (likely(!err && !j->flush_in_progress_dropped))
- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
- j->flush_in_progress = NULL;
- j->flush_in_progress_dropped = false;
- spin_unlock(&j->lock);
-
- wake_up(&j->pin_flush_wait);
-
- if (err)
- break;
-
- nr_flushed++;
- }
-
- return nr_flushed;
-}
-
-static u64 journal_seq_to_flush(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- u64 seq_to_flush = 0;
-
- guard(spinlock)(&j->lock);
- guard(rcu)();
-
- for_each_rw_member_rcu(c, ca) {
- struct journal_device *ja = &ca->journal;
- unsigned nr_buckets, bucket_to_flush;
-
- if (!ja->nr)
- continue;
-
- /* Try to keep the journal at most half full: */
- nr_buckets = ja->nr / 2;
-
- bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
- seq_to_flush = max(seq_to_flush,
- ja->bucket_seq[bucket_to_flush]);
- }
-
- /* Also flush if the pin fifo is more than half full */
- return max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
-}
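The "at most half full" target works out per device as: flush everything up to the sequence number stored in the bucket nr/2 slots ahead of the one currently being written, which leaves at most half the buckets dirty. A hedged userspace sketch with an invented ring of eight buckets:

#include <stdio.h>

int main(void)
{
	/*
	 * Invented device: 8 journal buckets, written in ring order starting
	 * at index 6, currently writing bucket 5.  bucket_seq[i] holds the
	 * newest journal sequence number stored in bucket i.
	 */
	unsigned long long bucket_seq[8] = { 30, 40, 50, 60, 70, 80, 10, 20 };
	unsigned nr = 8, cur_idx = 5;

	/*
	 * Flushing everything up to the seq held by the bucket nr/2 slots
	 * ahead of cur_idx frees the older half of the ring:
	 */
	unsigned bucket_to_flush = (cur_idx + nr / 2) % nr;
	unsigned long long seq_to_flush = bucket_seq[bucket_to_flush];

	printf("flush up to seq %llu (bucket %u)\n", seq_to_flush, bucket_to_flush);
	return 0;
}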
-
-/**
- * __bch2_journal_reclaim - free up journal buckets
- * @j: journal object
- * @direct: direct or background reclaim?
- * @kicked: requested to run since we last ran?
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct btree_cache *bc = &c->btree_cache;
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- u64 seq_to_flush;
- size_t min_nr, min_key_cache, nr_flushed;
- unsigned flags;
- int ret = 0;
-
- /*
- * We can't invoke memory reclaim while holding the reclaim_lock -
- * journal reclaim is required to make progress for memory reclaim
- * (cleaning the caches), so we can't get stuck in memory reclaim while
- * we're holding the reclaim lock:
- */
- lockdep_assert_held(&j->reclaim_lock);
- flags = memalloc_noreclaim_save();
-
- do {
- if (kthread && kthread_should_stop())
- break;
-
- ret = bch2_journal_error(j);
- if (ret)
- break;
-
- /* XXX shove journal discards off to another thread */
- bch2_journal_do_discards(j);
-
- seq_to_flush = journal_seq_to_flush(j);
- min_nr = 0;
-
- /*
-		 * If it's been longer than c->opts.journal_reclaim_delay since we
-		 * last flushed, make sure to flush at least one journal pin:
- */
- if (time_after(jiffies, j->last_flushed +
- msecs_to_jiffies(c->opts.journal_reclaim_delay)))
- min_nr = 1;
-
- if (j->watermark != BCH_WATERMARK_stripe)
- min_nr = 1;
-
- size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
- if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
- min_nr = 1;
-
- min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
-
- trace_and_count(c, journal_reclaim_start, c,
- direct, kicked,
- min_nr, min_key_cache,
- atomic_long_read(&bc->nr_dirty), btree_cache_live,
- atomic_long_read(&c->btree_key_cache.nr_dirty),
- atomic_long_read(&c->btree_key_cache.nr_keys));
-
- nr_flushed = journal_flush_pins(j, seq_to_flush,
- ~0, 0,
- min_nr, min_key_cache);
-
- if (direct)
- j->nr_direct_reclaim += nr_flushed;
- else
- j->nr_background_reclaim += nr_flushed;
- trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
-
- if (nr_flushed)
- wake_up(&j->reclaim_wait);
- } while ((min_nr || min_key_cache) && nr_flushed && !direct);
-
- memalloc_noreclaim_restore(flags);
-
- return ret;
-}
-
-int bch2_journal_reclaim(struct journal *j)
-{
- return __bch2_journal_reclaim(j, true, true);
-}
-
-static int bch2_journal_reclaim_thread(void *arg)
-{
- struct journal *j = arg;
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned long delay, now;
- bool journal_empty;
- int ret = 0;
-
- set_freezable();
-
- j->last_flushed = jiffies;
-
- while (!ret && !kthread_should_stop()) {
- bool kicked = j->reclaim_kicked;
-
- j->reclaim_kicked = false;
-
- mutex_lock(&j->reclaim_lock);
- ret = __bch2_journal_reclaim(j, false, kicked);
- mutex_unlock(&j->reclaim_lock);
-
- now = jiffies;
- delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
- j->next_reclaim = j->last_flushed + delay;
-
- if (!time_in_range(j->next_reclaim, now, now + delay))
- j->next_reclaim = now + delay;
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
- if (kthread_should_stop())
- break;
- if (j->reclaim_kicked)
- break;
-
- spin_lock(&j->lock);
- journal_empty = fifo_empty(&j->pin);
- spin_unlock(&j->lock);
-
- long timeout = j->next_reclaim - jiffies;
-
- if (journal_empty)
- schedule();
- else if (timeout > 0)
- schedule_timeout(timeout);
- else
- break;
- }
- __set_current_state(TASK_RUNNING);
- }
-
- return 0;
-}
-
-void bch2_journal_reclaim_stop(struct journal *j)
-{
- struct task_struct *p = j->reclaim_thread;
-
- j->reclaim_thread = NULL;
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-int bch2_journal_reclaim_start(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct task_struct *p;
- int ret;
-
- if (j->reclaim_thread)
- return 0;
-
- p = kthread_create(bch2_journal_reclaim_thread, j,
- "bch-reclaim/%s", c->name);
- ret = PTR_ERR_OR_ZERO(p);
- bch_err_msg(c, ret, "creating journal reclaim thread");
- if (ret)
- return ret;
-
- get_task_struct(p);
- j->reclaim_thread = p;
- wake_up_process(p);
- return 0;
-}
-
-static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
- unsigned types)
-{
- struct journal_entry_pin_list *pin_list;
- u64 seq;
-
- spin_lock(&j->lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
- if (seq > seq_to_flush)
- break;
-
- for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
- if ((BIT(i) & types) &&
- (!list_empty(&pin_list->unflushed[i]) ||
- !list_empty(&pin_list->flushed[i]))) {
- spin_unlock(&j->lock);
- return true;
- }
- }
- spin_unlock(&j->lock);
-
- return false;
-}
-
-static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
- unsigned types)
-{
- return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
- journal_pins_still_flushing(j, seq_to_flush, types);
-}
-
-static int journal_flush_done(struct journal *j, u64 seq_to_flush,
- bool *did_work)
-{
- int ret = 0;
-
- ret = bch2_journal_error(j);
- if (ret)
- return ret;
-
- mutex_lock(&j->reclaim_lock);
-
- for (int type = JOURNAL_PIN_TYPE_NR - 1;
- type >= 0;
- --type)
- if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
- *did_work = true;
- goto unlock;
- }
-
- if (seq_to_flush > journal_cur_seq(j))
- bch2_journal_entry_close(j);
-
- spin_lock(&j->lock);
- /*
- * If journal replay hasn't completed, the unreplayed journal entries
- * hold refs on their corresponding sequence numbers
- */
- ret = !test_bit(JOURNAL_replay_done, &j->flags) ||
- journal_last_seq(j) > seq_to_flush ||
- !fifo_used(&j->pin);
-
- spin_unlock(&j->lock);
-unlock:
- mutex_unlock(&j->reclaim_lock);
-
- return ret;
-}
-
-bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
-{
- /* time_stats this */
- bool did_work = false;
-
- if (!test_bit(JOURNAL_running, &j->flags))
- return false;
-
- closure_wait_event(&j->reclaim_flush_wait,
- journal_flush_done(j, seq_to_flush, &did_work));
-
- return did_work;
-}
-
-int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin_list *p;
- u64 iter, seq = 0;
- int ret = 0;
-
- spin_lock(&j->lock);
- fifo_for_each_entry_ptr(p, &j->pin, iter)
- if (dev_idx >= 0
- ? bch2_dev_list_has_dev(p->devs, dev_idx)
- : p->devs.nr < c->opts.metadata_replicas)
- seq = iter;
- spin_unlock(&j->lock);
-
- bch2_journal_flush_pins(j, seq);
-
- ret = bch2_journal_error(j);
- if (ret)
- return ret;
-
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
-
- /*
- * Now that we've populated replicas_gc, write to the journal to mark
- * active journal devices. This handles the case where the journal might
- * be empty. Otherwise we could clear all journal replicas and
- * temporarily put the fs into an unrecoverable state. Journal recovery
- * expects to find devices marked for journal data on unclean mount.
- */
- ret = bch2_journal_meta(&c->journal);
- if (ret)
- goto err;
-
- seq = 0;
- spin_lock(&j->lock);
- while (!ret) {
- union bch_replicas_padded replicas;
-
- seq = max(seq, journal_last_seq(j));
- if (seq >= j->pin.back)
- break;
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
- journal_seq_pin(j, seq)->devs);
- seq++;
-
- if (replicas.e.nr_devs) {
- spin_unlock(&j->lock);
- ret = bch2_mark_replicas(c, &replicas.e);
- spin_lock(&j->lock);
- }
- }
- spin_unlock(&j->lock);
-err:
- ret = bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
-
- return ret;
-}
-
-bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *pin;
-
- spin_lock(&j->lock);
- if (!test_bit(JOURNAL_running, &j->flags)) {
- spin_unlock(&j->lock);
- return true;
- }
-
- *seq = max(*seq, j->pin.front);
-
- if (*seq >= j->pin.back) {
- spin_unlock(&j->lock);
- return true;
- }
-
- out->atomic++;
-
- pin_list = journal_seq_pin(j, *seq);
-
- prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "unflushed:\n");
- for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
- list_for_each_entry(pin, &pin_list->unflushed[i], list)
- prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
- prt_printf(out, "flushed:\n");
- for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
- list_for_each_entry(pin, &pin_list->flushed[i], list)
- prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
- printbuf_indent_sub(out, 2);
-
- --out->atomic;
- spin_unlock(&j->lock);
-
- return false;
-}
-
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
-{
- u64 seq = 0;
-
- while (!bch2_journal_seq_pins_to_text(out, j, &seq))
- seq++;
-}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
deleted file mode 100644
index 0a73d7134e1c..000000000000
--- a/fs/bcachefs/journal_reclaim.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
-#define _BCACHEFS_JOURNAL_RECLAIM_H
-
-#define JOURNAL_PIN (32 * 1024)
-
-static inline void journal_reclaim_kick(struct journal *j)
-{
- struct task_struct *p = READ_ONCE(j->reclaim_thread);
-
- j->reclaim_kicked = true;
- if (p)
- wake_up_process(p);
-}
-
-unsigned bch2_journal_dev_buckets_available(struct journal *,
- struct journal_device *,
- enum journal_space_from);
-void bch2_journal_set_watermark(struct journal *);
-void bch2_journal_space_available(struct journal *);
-
-static inline bool journal_pin_active(struct journal_entry_pin *pin)
-{
- return pin->seq != 0;
-}
-
-static inline struct journal_entry_pin_list *
-journal_seq_pin(struct journal *j, u64 seq)
-{
- EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
-
- return &j->pin.data[seq & j->pin.mask];
-}
-
-void bch2_journal_reclaim_fast(struct journal *);
-bool __bch2_journal_pin_put(struct journal *, u64);
-void bch2_journal_pin_put(struct journal *, u64);
-void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-
-void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
- journal_pin_flush_fn);
-
-static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
- bch2_journal_pin_set(j, seq, pin, flush_fn);
-}
-
-void bch2_journal_pin_copy(struct journal *,
- struct journal_entry_pin *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
-
-static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
- bch2_journal_pin_set(j, seq, pin, flush_fn);
-}
-
-void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
-
-void bch2_journal_do_discards(struct journal *);
-int bch2_journal_reclaim(struct journal *);
-
-void bch2_journal_reclaim_stop(struct journal *);
-int bch2_journal_reclaim_start(struct journal *);
-
-bool bch2_journal_flush_pins(struct journal *, u64);
-
-static inline bool bch2_journal_flush_all_pins(struct journal *j)
-{
- return bch2_journal_flush_pins(j, U64_MAX);
-}
-
-int bch2_journal_flush_device_pins(struct journal *, int);
-
-void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
-bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
-
-#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
deleted file mode 100644
index 0cb9b93f13e7..000000000000
--- a/fs/bcachefs/journal_sb.c
+++ /dev/null
@@ -1,232 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "journal_sb.h"
-#include "darray.h"
-
-#include <linux/sort.h>
-
-/* BCH_SB_FIELD_journal: */
-
-static int u64_cmp(const void *_l, const void *_r)
-{
- const u64 *l = _l;
- const u64 *r = _r;
-
- return cmp_int(*l, *r);
-}
-
-static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
- int ret = -BCH_ERR_invalid_sb_journal;
- unsigned nr;
- unsigned i;
- u64 *b;
-
- nr = bch2_nr_journal_buckets(journal);
- if (!nr)
- return 0;
-
- b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
- if (!b)
- return -BCH_ERR_ENOMEM_sb_journal_validate;
-
- for (i = 0; i < nr; i++)
- b[i] = le64_to_cpu(journal->buckets[i]);
-
- sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
- if (!b[0]) {
- prt_printf(err, "journal bucket at sector 0");
- goto err;
- }
-
- if (b[0] < le16_to_cpu(m.first_bucket)) {
- prt_printf(err, "journal bucket %llu before first bucket %u",
- b[0], le16_to_cpu(m.first_bucket));
- goto err;
- }
-
- if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) {
- prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
- b[nr - 1], le64_to_cpu(m.nbuckets));
- goto err;
- }
-
- for (i = 0; i + 1 < nr; i++)
- if (b[i] == b[i + 1]) {
- prt_printf(err, "duplicate journal buckets %llu", b[i]);
- goto err;
- }
-
- ret = 0;
-err:
- kfree(b);
- return ret;
-}
-
-static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- unsigned i, nr = bch2_nr_journal_buckets(journal);
-
- prt_printf(out, "Buckets: ");
- for (i = 0; i < nr; i++)
- prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal = {
- .validate = bch2_sb_journal_validate,
- .to_text = bch2_sb_journal_to_text,
-};
-
-struct u64_range {
- u64 start;
- u64 end;
-};
-
-static int u64_range_cmp(const void *_l, const void *_r)
-{
- const struct u64_range *l = _l;
- const struct u64_range *r = _r;
-
- return cmp_int(l->start, r->start);
-}
-
-static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
- struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
- int ret = -BCH_ERR_invalid_sb_journal;
- u64 sum = 0;
- unsigned nr;
- unsigned i;
- struct u64_range *b;
-
- nr = bch2_sb_field_journal_v2_nr_entries(journal);
- if (!nr)
- return 0;
-
- b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
- if (!b)
- return -BCH_ERR_ENOMEM_sb_journal_v2_validate;
-
- for (i = 0; i < nr; i++) {
- b[i].start = le64_to_cpu(journal->d[i].start);
- b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
-
- if (b[i].end <= b[i].start) {
- prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
- le64_to_cpu(journal->d[i].start),
- le64_to_cpu(journal->d[i].nr));
- goto err;
- }
-
- sum += le64_to_cpu(journal->d[i].nr);
- }
-
- sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
-
- if (!b[0].start) {
- prt_printf(err, "journal bucket at sector 0");
- goto err;
- }
-
- if (b[0].start < le16_to_cpu(m.first_bucket)) {
- prt_printf(err, "journal bucket %llu before first bucket %u",
- b[0].start, le16_to_cpu(m.first_bucket));
- goto err;
- }
-
- if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
- prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
- b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
- goto err;
- }
-
- for (i = 0; i + 1 < nr; i++) {
- if (b[i].end > b[i + 1].start) {
- prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
- b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
- goto err;
- }
- }
-
- if (sum > UINT_MAX) {
- prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
- goto err;
- }
-
- ret = 0;
-err:
- kfree(b);
- return ret;
-}
-
-static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
- unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
-
- prt_printf(out, "Buckets: ");
- for (i = 0; i < nr; i++)
- prt_printf(out, " %llu-%llu",
- le64_to_cpu(journal->d[i].start),
- le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
- .validate = bch2_sb_journal_v2_validate,
- .to_text = bch2_sb_journal_v2_to_text,
-};
-
-int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
- u64 *buckets, unsigned nr)
-{
- struct bch_sb_field_journal_v2 *j;
- unsigned i, dst = 0, nr_compacted = 1;
-
- if (c)
- lockdep_assert_held(&c->sb_lock);
-
- if (!nr) {
- bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
- bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
- return 0;
- }
-
- for (i = 0; i + 1 < nr; i++)
- if (buckets[i] + 1 != buckets[i + 1])
- nr_compacted++;
-
- j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
- (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
- if (!j)
- return bch_err_throw(c, ENOSPC_sb_journal);
-
- bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
-
- j->d[dst].start = cpu_to_le64(buckets[0]);
- j->d[dst].nr = cpu_to_le64(1);
-
- for (i = 1; i < nr; i++) {
- if (buckets[i] == buckets[i - 1] + 1) {
- le64_add_cpu(&j->d[dst].nr, 1);
- } else {
- dst++;
- j->d[dst].start = cpu_to_le64(buckets[i]);
- j->d[dst].nr = cpu_to_le64(1);
- }
- }
-
- BUG_ON(dst + 1 != nr_compacted);
- return 0;
-}
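The journal_v2 superblock field stores the bucket list as extents, so runs of consecutive bucket numbers collapse into a single (start, nr) pair, which is what the loop above builds. The same compaction over plain integers, as a self-contained sketch (sample bucket numbers invented):

#include <stdio.h>
#include <stdint.h>

struct extent { uint64_t start, nr; };

/*
 * Collapse a sorted list of bucket numbers into (start, nr) runs;
 * returns the number of runs written to "out".
 */
static unsigned compact_buckets(const uint64_t *b, unsigned nr, struct extent *out)
{
	unsigned dst = 0;

	out[0] = (struct extent) { .start = b[0], .nr = 1 };

	for (unsigned i = 1; i < nr; i++) {
		if (b[i] == b[i - 1] + 1)
			out[dst].nr++;
		else
			out[++dst] = (struct extent) { .start = b[i], .nr = 1 };
	}

	return dst + 1;
}

int main(void)
{
	uint64_t buckets[] = { 32, 33, 34, 40, 41, 100 };
	struct extent e[6];
	unsigned n = compact_buckets(buckets, 6, e);

	for (unsigned i = 0; i < n; i++)
		printf("%llu+%llu\n", (unsigned long long) e[i].start,
				      (unsigned long long) e[i].nr);
	return 0;
}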
diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h
deleted file mode 100644
index ba40a7e8d90a..000000000000
--- a/fs/bcachefs/journal_sb.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#include "super-io.h"
-#include "vstructs.h"
-
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
- return j
- ? (__le64 *) vstruct_end(&j->field) - j->buckets
- : 0;
-}
-
-static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
-{
- if (!j)
- return 0;
-
- return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
-}
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
-
-int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
deleted file mode 100644
index af4fe416d9ec..000000000000
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ /dev/null
@@ -1,264 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "eytzinger.h"
-#include "journal.h"
-#include "journal_seq_blacklist.h"
-#include "super-io.h"
-
-/*
- * journal_seq_blacklist machinery:
- *
- * To guarantee order of btree updates after a crash, we need to detect when a
- * btree node entry (bset) is newer than the newest journal entry that was
- * successfully written, and ignore it - effectively ignoring any btree updates
- * that didn't make it into the journal.
- *
- * If we didn't do this, we might have two btree nodes, a and b, both with
- * updates that weren't written to the journal yet: if b was updated after a,
- * but b was flushed and not a - oops; on recovery we'll find that the updates
- * to b happened, but not the updates to a that happened before it.
- *
- * Ignoring bsets that are newer than the newest journal entry is always safe,
- * because everything they contain will also have been journalled - and must
- * still be present in the journal on disk until a journal entry has been
- * written _after_ that bset was written.
- *
- * To accomplish this, bsets record the newest journal sequence number they
- * contain updates for; then, on startup, the btree code queries the journal
- * code to ask "Is this sequence number newer than the newest journal entry? If
- * so, ignore it."
- *
- * When this happens, we must blacklist that journal sequence number: the
- * journal must not write any entries with that sequence number, and it must
- * record that it was blacklisted so that a) on recovery we don't think we have
- * missing journal entries and b) so that the btree code continues to ignore
- * that bset, until that btree node is rewritten.
- */
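At lookup time the question "is this sequence number blacklisted?" reduces to: find the entry with the largest start <= seq and check whether seq is still below its end. The in-memory table built later in this file uses an Eytzinger layout for that search; a plain sorted-array sketch of the same lookup (entries invented) is:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

struct bl_entry { uint64_t start, end; };	/* blacklists [start, end) */

static bool seq_is_blacklisted(const struct bl_entry *t, unsigned nr, uint64_t seq)
{
	int idx = -1;

	/* Binary search for the last entry with start <= seq: */
	for (unsigned l = 0, r = nr; l < r;) {
		unsigned m = l + (r - l) / 2;

		if (t[m].start <= seq) {
			idx = m;
			l = m + 1;
		} else {
			r = m;
		}
	}

	return idx >= 0 && seq < t[idx].end;
}

int main(void)
{
	struct bl_entry table[] = { { 10, 12 }, { 30, 40 } };

	printf("%d %d %d\n",
	       seq_is_blacklisted(table, 2, 11),	/* 1: inside [10, 12) */
	       seq_is_blacklisted(table, 2, 12),	/* 0: past its end    */
	       seq_is_blacklisted(table, 2, 35));	/* 1: inside [30, 40) */
	return 0;
}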
-
-static unsigned sb_blacklist_u64s(unsigned nr)
-{
- struct bch_sb_field_journal_seq_blacklist *bl;
-
- return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
-}
-
-int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
-{
- struct bch_sb_field_journal_seq_blacklist *bl;
- unsigned i = 0, nr;
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
- bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
- nr = blacklist_nr_entries(bl);
-
- while (i < nr) {
- struct journal_seq_blacklist_entry *e =
- bl->start + i;
-
- if (end < le64_to_cpu(e->start))
- break;
-
- if (start > le64_to_cpu(e->end)) {
- i++;
- continue;
- }
-
- /*
- * Entry is contiguous or overlapping with new entry: merge it
- * with new entry, and delete:
- */
-
- start = min(start, le64_to_cpu(e->start));
- end = max(end, le64_to_cpu(e->end));
- array_remove_item(bl->start, nr, i);
- }
-
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- sb_blacklist_u64s(nr + 1));
- if (!bl) {
- ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist);
- goto out;
- }
-
- array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
- .start = cpu_to_le64(start),
- .end = cpu_to_le64(end),
- }));
- c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
-
- ret = bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-
- return ret ?: bch2_blacklist_table_initialize(c);
-}
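The loop above keeps the blacklist sorted and non-overlapping: every existing entry that touches the new range is absorbed into it and deleted, and the widened range is then inserted where the scan stopped. A self-contained sketch of that merge step over a fixed-size array (sample ranges invented; the sketch treats ranges as inclusive for simplicity):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct range { uint64_t start, end; };	/* inclusive, sorted by start */

static unsigned range_add(struct range *r, unsigned nr, uint64_t start, uint64_t end)
{
	unsigned i = 0;

	while (i < nr) {
		if (end < r[i].start)
			break;			/* strictly after the new range */
		if (start > r[i].end) {
			i++;			/* strictly before it: keep looking */
			continue;
		}
		/* Overlapping or contiguous: absorb the entry and delete it. */
		if (r[i].start < start)
			start = r[i].start;
		if (r[i].end > end)
			end = r[i].end;
		memmove(&r[i], &r[i + 1], (nr - i - 1) * sizeof(*r));
		nr--;
	}

	/* Insert the (possibly widened) range at position i: */
	memmove(&r[i + 1], &r[i], (nr - i) * sizeof(*r));
	r[i] = (struct range) { start, end };
	return nr + 1;
}

int main(void)
{
	struct range r[8] = { { 1, 3 }, { 10, 12 }, { 20, 25 } };
	unsigned nr = range_add(r, 3, 11, 21);	/* swallows {10,12} and {20,25} */

	for (unsigned i = 0; i < nr; i++)
		printf("%llu-%llu\n", (unsigned long long) r[i].start,
				      (unsigned long long) r[i].end);
	return 0;
}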
-
-static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
-{
- const struct journal_seq_blacklist_table_entry *l = _l;
- const struct journal_seq_blacklist_table_entry *r = _r;
-
- return cmp_int(l->start, r->start);
-}
-
-bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
- bool dirty)
-{
- struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
- struct journal_seq_blacklist_table_entry search = { .start = seq };
- int idx;
-
- if (!t)
- return false;
-
- idx = eytzinger0_find_le(t->entries, t->nr,
- sizeof(t->entries[0]),
- journal_seq_blacklist_table_cmp,
- &search);
- if (idx < 0)
- return false;
-
- BUG_ON(t->entries[idx].start > seq);
-
- if (seq >= t->entries[idx].end)
- return false;
-
- if (dirty)
- t->entries[idx].dirty = true;
- return true;
-}
-
-u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c)
-{
- struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
-
- if (!t || !t->nr)
- return 0;
-
- return t->entries[eytzinger0_last(t->nr)].end - 1;
-}
-
-int bch2_blacklist_table_initialize(struct bch_fs *c)
-{
- struct bch_sb_field_journal_seq_blacklist *bl =
- bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
- struct journal_seq_blacklist_table *t;
- unsigned i, nr = blacklist_nr_entries(bl);
-
- if (!bl)
- return 0;
-
- t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
- if (!t)
- return bch_err_throw(c, ENOMEM_blacklist_table_init);
-
- t->nr = nr;
-
- for (i = 0; i < nr; i++) {
- t->entries[i].start = le64_to_cpu(bl->start[i].start);
- t->entries[i].end = le64_to_cpu(bl->start[i].end);
- }
-
- eytzinger0_sort(t->entries,
- t->nr,
- sizeof(t->entries[0]),
- journal_seq_blacklist_table_cmp,
- NULL);
-
- kfree(c->journal_seq_blacklist_table);
- c->journal_seq_blacklist_table = t;
- return 0;
-}
-
-static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_journal_seq_blacklist *bl =
- field_to_type(f, journal_seq_blacklist);
- unsigned i, nr = blacklist_nr_entries(bl);
-
- for (i = 0; i < nr; i++) {
- struct journal_seq_blacklist_entry *e = bl->start + i;
-
- if (le64_to_cpu(e->start) >=
- le64_to_cpu(e->end)) {
- prt_printf(err, "entry %u start >= end (%llu >= %llu)",
- i, le64_to_cpu(e->start), le64_to_cpu(e->end));
- return -BCH_ERR_invalid_sb_journal_seq_blacklist;
- }
-
- if (i + 1 < nr &&
- le64_to_cpu(e[0].end) >
- le64_to_cpu(e[1].start)) {
- prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
- i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
- return -BCH_ERR_invalid_sb_journal_seq_blacklist;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_journal_seq_blacklist *bl =
- field_to_type(f, journal_seq_blacklist);
- struct journal_seq_blacklist_entry *i;
- unsigned nr = blacklist_nr_entries(bl);
-
- for (i = bl->start; i < bl->start + nr; i++) {
- if (i != bl->start)
- prt_printf(out, " ");
-
- prt_printf(out, "%llu-%llu",
- le64_to_cpu(i->start),
- le64_to_cpu(i->end));
- }
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
- .validate = bch2_sb_journal_seq_blacklist_validate,
- .to_text = bch2_sb_journal_seq_blacklist_to_text
-};
-
-bool bch2_blacklist_entries_gc(struct bch_fs *c)
-{
- struct journal_seq_blacklist_entry *src, *dst;
-
- struct bch_sb_field_journal_seq_blacklist *bl =
- bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
- if (!bl)
- return false;
-
- unsigned nr = blacklist_nr_entries(bl);
- dst = bl->start;
-
- struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
- BUG_ON(nr != t->nr);
-
- src = bl->start;
- eytzinger0_for_each(i, nr) {
- BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
- BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
-
- if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
- *dst++ = *src;
- src++;
- }
-
- unsigned new_nr = dst - bl->start;
- if (new_nr == nr)
- return false;
-
- bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr);
-
- bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
- new_nr ? sb_blacklist_u64s(new_nr) : 0);
- BUG_ON(new_nr && !bl);
- return true;
-}
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
deleted file mode 100644
index f06942ccfcdd..000000000000
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
-#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
-
-static inline unsigned
-blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
-{
- return bl
- ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
- sizeof(struct journal_seq_blacklist_entry))
- : 0;
-}
-
-bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
-u64 bch2_journal_last_blacklisted_seq(struct bch_fs *);
-int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
-int bch2_blacklist_table_initialize(struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
-
-bool bch2_blacklist_entries_gc(struct bch_fs *);
-
-#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h
deleted file mode 100644
index 2566b12dbc04..000000000000
--- a/fs/bcachefs/journal_seq_blacklist_format.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
-#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
-
-struct journal_seq_blacklist_entry {
- __le64 start;
- __le64 end;
-};
-
-struct bch_sb_field_journal_seq_blacklist {
- struct bch_sb_field field;
- struct journal_seq_blacklist_entry start[];
-};
-
-#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
deleted file mode 100644
index 51104bbb99da..000000000000
--- a/fs/bcachefs/journal_types.h
+++ /dev/null
@@ -1,342 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_JOURNAL_TYPES_H
-#define _BCACHEFS_JOURNAL_TYPES_H
-
-#include <linux/cache.h>
-#include <linux/workqueue.h>
-
-#include "alloc_types.h"
-#include "super_types.h"
-#include "fifo.h"
-
-/* btree write buffer steals 8 bits for its own purposes: */
-#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)
-
-#define JOURNAL_STATE_BUF_BITS 2
-#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS)
-#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1)
-
-#define JOURNAL_BUF_BITS 4
-#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
-#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
-
-/*
- * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
- * the journal that are being staged or in flight.
- */
-struct journal_buf {
- struct closure io;
- struct jset *data;
-
- __BKEY_PADDED(key, BCH_REPLICAS_MAX);
- struct bch_devs_list devs_written;
-
- struct closure_waitlist wait;
- u64 last_seq; /* copy of data->last_seq */
- long expires;
- u64 flush_time;
-
- unsigned buf_size; /* size in bytes of @data */
- unsigned sectors; /* maximum size for current entry */
- unsigned disk_sectors; /* maximum size entry could have been, if
- buf_size was bigger */
- unsigned u64s_reserved;
- bool noflush:1; /* write has already been kicked off, and was noflush */
- bool must_flush:1; /* something wants a flush */
- bool separate_flush:1;
- bool need_flush_to_write_buffer:1;
- bool write_started:1;
- bool write_allocated:1;
- bool write_done:1;
- u8 idx;
-};
-
-/*
- * Something that makes a journal entry dirty - i.e. a btree node that has to be
- * flushed:
- */
-
-enum journal_pin_type {
- JOURNAL_PIN_TYPE_btree3,
- JOURNAL_PIN_TYPE_btree2,
- JOURNAL_PIN_TYPE_btree1,
- JOURNAL_PIN_TYPE_btree0,
- JOURNAL_PIN_TYPE_key_cache,
- JOURNAL_PIN_TYPE_other,
- JOURNAL_PIN_TYPE_NR,
-};
-
-struct journal_entry_pin_list {
- struct list_head unflushed[JOURNAL_PIN_TYPE_NR];
- struct list_head flushed[JOURNAL_PIN_TYPE_NR];
- atomic_t count;
- struct bch_devs_list devs;
-};
-
-struct journal;
-struct journal_entry_pin;
-typedef int (*journal_pin_flush_fn)(struct journal *j,
- struct journal_entry_pin *, u64);
-
-struct journal_entry_pin {
- struct list_head list;
- journal_pin_flush_fn flush;
- u64 seq;
-};
-
-struct journal_res {
- bool ref;
- u16 u64s;
- u32 offset;
- u64 seq;
-};
-
-union journal_res_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- u64 cur_entry_offset:22,
- idx:2,
- buf0_count:10,
- buf1_count:10,
- buf2_count:10,
- buf3_count:10;
- };
-};
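Packing the whole reservation state into one 64-bit word lets the fast path read and update it with a single atomic compare-and-swap rather than taking a lock. A userspace sketch of that pattern using C11 atomics (field widths copied from the union above; 64-bit bit-fields are a GCC/Clang extension, as in the kernel):

#include <stdatomic.h>
#include <stdio.h>
#include <stdint.h>

union res_state {
	uint64_t v;
	struct {
		uint64_t cur_entry_offset:22,
			 idx:2,
			 buf0_count:10,
			 buf1_count:10,
			 buf2_count:10,
			 buf3_count:10;
	};
};

int main(void)
{
	_Atomic uint64_t state = 0;
	union res_state old, new;

	/*
	 * Take a reference on the currently open buffer and bump the entry
	 * offset, as one atomic transition of the packed state word:
	 */
	do {
		old.v = atomic_load(&state);
		new = old;
		new.buf0_count++;
		new.cur_entry_offset += 8;	/* e.g. reserve 8 u64s */
	} while (!atomic_compare_exchange_weak(&state, &old.v, new.v));

	printf("offset %u refs %u\n",
	       (unsigned) new.cur_entry_offset, (unsigned) new.buf0_count);
	return 0;
}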
-
-/* bytes: */
-#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
-#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */
-
-/*
- * We stash some journal state as sentinel values in cur_entry_offset:
- * note - cur_entry_offset is in units of u64s
- */
-#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1)
-
-#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
-#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
-#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
-
-struct journal_space {
-	/* Units of 512-byte sectors: */
- unsigned next_entry; /* How big the next journal entry can be */
- unsigned total;
-};
-
-enum journal_space_from {
- journal_space_discarded,
- journal_space_clean_ondisk,
- journal_space_clean,
- journal_space_total,
- journal_space_nr,
-};
-
-#define JOURNAL_FLAGS() \
- x(replay_done) \
- x(running) \
- x(may_skip_flush) \
- x(need_flush_write) \
- x(space_low)
-
-enum journal_flags {
-#define x(n) JOURNAL_##n,
- JOURNAL_FLAGS()
-#undef x
-};
-
-struct journal_bio {
- struct bch_dev *ca;
- unsigned buf_idx;
- u64 submit_time;
-
- struct bio bio;
-};
-
-/* Embedded in struct bch_fs */
-struct journal {
- /* Fastpath stuff up front: */
- struct {
-
- union journal_res_state reservations;
- enum bch_watermark watermark;
-
- } __aligned(SMP_CACHE_BYTES);
-
- unsigned long flags;
-
- /* Max size of current journal entry */
- unsigned cur_entry_u64s;
- unsigned cur_entry_sectors;
-
- /* Reserved space in journal entry to be used just prior to write */
- unsigned entry_u64s_reserved;
-
-
- /*
- * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
- * insufficient devices:
- */
- int cur_entry_error;
- unsigned cur_entry_offset_if_blocked;
-
- unsigned buf_size_want;
- /*
- * We may queue up some things to be journalled (log messages) before
- * the journal has actually started - stash them here:
- */
- darray_u64 early_journal_entries;
-
- /*
-	 * Protects journal_buf->data, when accessing without a journal
- * reservation: for synchronization between the btree write buffer code
- * and the journal write path:
- */
- struct mutex buf_lock;
- /*
-	 * The journal entry buffers: one is currently open for new entries,
-	 * the others are possibly being written out.
- */
- struct journal_buf buf[JOURNAL_BUF_NR];
- void *free_buf;
- unsigned free_buf_size;
-
- spinlock_t lock;
-
- /* if nonzero, we may not open a new journal entry: */
- unsigned blocked;
-
- /* Used when waiting because the journal was full */
- wait_queue_head_t wait;
- struct closure_waitlist async_wait;
- struct closure_waitlist reclaim_flush_wait;
-
- struct delayed_work write_work;
- struct workqueue_struct *wq;
-
- /* Sequence number of most recent journal entry (last entry in @pin) */
- atomic64_t seq;
-
- u64 seq_write_started;
- /* seq, last_seq from the most recent journal entry successfully written */
- u64 seq_ondisk;
- u64 flushed_seq_ondisk;
- u64 flushing_seq;
- u64 last_seq_ondisk;
- u64 err_seq;
- u64 last_empty_seq;
- u64 oldest_seq_found_ondisk;
-
- /*
- * FIFO of journal entries whose btree updates have not yet been
- * written out.
- *
- * Each entry is a reference count. The position in the FIFO is the
- * entry's sequence number relative to @seq.
- *
- * The journal entry itself holds a reference count, put when the
- * journal entry is written out. Each btree node modified by the journal
- * entry also holds a reference count, put when the btree node is
- * written.
- *
- * When a reference count reaches zero, the journal entry is no longer
- * needed. When all journal entries in the oldest journal bucket are no
- * longer needed, the bucket can be discarded and reused.
- */
- struct {
- u64 front, back, size, mask;
- struct journal_entry_pin_list *data;
- } pin;
-
- struct journal_space space[journal_space_nr];
-
- u64 replay_journal_seq;
- u64 replay_journal_seq_end;
-
- struct write_point wp;
- spinlock_t err_lock;
-
- struct mutex reclaim_lock;
- /*
- * Used for waiting until journal reclaim has freed up space in the
- * journal:
- */
- wait_queue_head_t reclaim_wait;
- struct task_struct *reclaim_thread;
- bool reclaim_kicked;
- unsigned long next_reclaim;
- u64 nr_direct_reclaim;
- u64 nr_background_reclaim;
-
- unsigned long last_flushed;
- struct journal_entry_pin *flush_in_progress;
- bool flush_in_progress_dropped;
- wait_queue_head_t pin_flush_wait;
-
- /* protects advancing ja->discard_idx: */
- struct mutex discard_lock;
- bool can_discard;
-
- unsigned long last_flush_write;
-
- u64 write_start_time;
-
- u64 nr_flush_writes;
- u64 nr_noflush_writes;
- u64 entry_bytes_written;
-
- struct bch2_time_stats *flush_write_time;
- struct bch2_time_stats *noflush_write_time;
- struct bch2_time_stats *flush_seq_time;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map res_map;
-#endif
-} __aligned(SMP_CACHE_BYTES);
-
-/*
- * Embedded in struct bch_dev. First three fields refer to the array of journal
- * buckets, in bch_sb.
- */
-struct journal_device {
- /*
- * For each journal bucket, contains the max sequence number of the
- * journal writes it contains - so we know when a bucket can be reused.
- */
- u64 *bucket_seq;
-
- unsigned sectors_free;
-
- /*
- * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
- */
- unsigned discard_idx; /* Next bucket to discard */
- unsigned dirty_idx_ondisk;
- unsigned dirty_idx;
- unsigned cur_idx; /* Journal bucket we're currently writing to */
- unsigned nr;
-
- u64 *buckets;
-
- /* Bio for journal reads/writes to this device */
- struct journal_bio *bio[JOURNAL_BUF_NR];
-
- /* for bch_journal_read_device */
- struct closure read;
- u64 highest_seq_found;
-};
-
-/*
- * journal_entry_res - reserve space in every journal entry:
- */
-struct journal_entry_res {
- unsigned u64s;
-};
-
-#endif /* _BCACHEFS_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
deleted file mode 100644
index 1b828bddd11b..000000000000
--- a/fs/bcachefs/keylist.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey.h"
-#include "keylist.h"
-
-int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
- size_t nr_inline_u64s, size_t new_u64s)
-{
- size_t oldsize = bch2_keylist_u64s(l);
- size_t newsize = oldsize + new_u64s;
- u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
- u64 *new_keys;
-
- newsize = roundup_pow_of_two(newsize);
-
- if (newsize <= nr_inline_u64s ||
- (old_buf && roundup_pow_of_two(oldsize) == newsize))
- return 0;
-
- new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS);
- if (!new_keys)
- return -ENOMEM;
-
- if (!old_buf)
- memcpy_u64s(new_keys, inline_u64s, oldsize);
-
- l->keys_p = new_keys;
- l->top_p = new_keys + oldsize;
-
- return 0;
-}
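bch2_keylist_realloc() starts out pointing at a small caller-provided inline buffer and only moves to the heap once the keys outgrow it, rounding capacity up to a power of two so repeated appends stay amortized. A standalone sketch of that inline-then-heap growth strategy, with plain u64s and invented helper names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct u64_list {
	uint64_t *buf;		/* points at inline_buf or a heap allocation */
	size_t used, capacity;
	uint64_t inline_buf[4];
};

static size_t roundup_pow_of_two(size_t n)
{
	size_t r = 1;
	while (r < n)
		r <<= 1;
	return r;
}

static int u64_list_make_room(struct u64_list *l, size_t extra)
{
	size_t want = roundup_pow_of_two(l->used + extra);

	if (want <= l->capacity)
		return 0;

	if (l->buf == l->inline_buf) {
		/* First spill to the heap: copy the inline contents over. */
		uint64_t *n = malloc(want * sizeof(*n));
		if (!n)
			return -1;
		memcpy(n, l->inline_buf, l->used * sizeof(*n));
		l->buf = n;
	} else {
		uint64_t *n = realloc(l->buf, want * sizeof(*n));
		if (!n)
			return -1;
		l->buf = n;
	}
	l->capacity = want;
	return 0;
}

int main(void)
{
	struct u64_list l = { .used = 0, .capacity = 4 };
	l.buf = l.inline_buf;

	for (uint64_t i = 0; i < 10; i++) {
		if (u64_list_make_room(&l, 1))
			return 1;
		l.buf[l.used++] = i;
	}

	printf("used %zu capacity %zu heap %d\n",
	       l.used, l.capacity, l.buf != l.inline_buf);

	if (l.buf != l.inline_buf)
		free(l.buf);
	return 0;
}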
-
-void bch2_keylist_pop_front(struct keylist *l)
-{
- l->top_p -= bch2_keylist_front(l)->k.u64s;
-
- memmove_u64s_down(l->keys,
- bkey_next(l->keys),
- bch2_keylist_u64s(l));
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_verify_keylist_sorted(struct keylist *l)
-{
- for_each_keylist_key(l, k)
- BUG_ON(bkey_next(k) != l->top &&
- bpos_ge(k->k.p, bkey_next(k)->k.p));
-}
-#endif
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
deleted file mode 100644
index e687e0e9aede..000000000000
--- a/fs/bcachefs/keylist.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_KEYLIST_H
-#define _BCACHEFS_KEYLIST_H
-
-#include "keylist_types.h"
-
-int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
-void bch2_keylist_pop_front(struct keylist *);
-
-static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
-{
- l->top_p = l->keys_p = inline_keys;
-}
-
-static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
-{
- if (l->keys_p != inline_keys)
- kfree(l->keys_p);
-}
-
-static inline void bch2_keylist_push(struct keylist *l)
-{
- l->top = bkey_next(l->top);
-}
-
-static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
-{
- bkey_copy(l->top, k);
- bch2_keylist_push(l);
-}
-
-static inline bool bch2_keylist_empty(struct keylist *l)
-{
- return l->top == l->keys;
-}
-
-static inline size_t bch2_keylist_u64s(struct keylist *l)
-{
- return l->top_p - l->keys_p;
-}
-
-static inline size_t bch2_keylist_bytes(struct keylist *l)
-{
- return bch2_keylist_u64s(l) * sizeof(u64);
-}
-
-static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
-{
- return l->keys;
-}
-
-#define for_each_keylist_key(_keylist, _k) \
- for (struct bkey_i *_k = (_keylist)->keys; \
- _k != (_keylist)->top; \
- _k = bkey_next(_k))
-
-static inline u64 keylist_sectors(struct keylist *keys)
-{
- u64 ret = 0;
-
- for_each_keylist_key(keys, k)
- ret += k->k.size;
- return ret;
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_verify_keylist_sorted(struct keylist *);
-#else
-static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
-#endif
-
-#endif /* _BCACHEFS_KEYLIST_H */
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
deleted file mode 100644
index 4b3ff7d8a875..000000000000
--- a/fs/bcachefs/keylist_types.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_KEYLIST_TYPES_H
-#define _BCACHEFS_KEYLIST_TYPES_H
-
-struct keylist {
- union {
- struct bkey_i *keys;
- u64 *keys_p;
- };
- union {
- struct bkey_i *top;
- u64 *top_p;
- };
-};
-
-#endif /* _BCACHEFS_KEYLIST_TYPES_H */
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
deleted file mode 100644
index 75f27ec26f85..000000000000
--- a/fs/bcachefs/logged_ops.c
+++ /dev/null
@@ -1,119 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "error.h"
-#include "io_misc.h"
-#include "logged_ops.h"
-#include "super.h"
-
-struct bch_logged_op_fn {
- u8 type;
- int (*resume)(struct btree_trans *, struct bkey_i *);
-};
-
-static const struct bch_logged_op_fn logged_op_fns[] = {
-#define x(n) { \
- .type = KEY_TYPE_logged_op_##n, \
- .resume = bch2_resume_logged_op_##n, \
-},
- BCH_LOGGED_OPS()
-#undef x
-};
-
-static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
-{
- for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
- if (logged_op_fns[i].type == type)
- return logged_op_fns + i;
- return NULL;
-}
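logged_op_fns[] is generated from the BCH_LOGGED_OPS() x-macro, so adding a new logged operation means adding one x() line plus its resume callback; the lookup is then a short linear scan of the table. A small self-contained sketch of the same x-macro dispatch pattern (operation names and handlers invented):

#include <stdio.h>
#include <stddef.h>

#define MY_OPS()	\
	x(truncate)	\
	x(finsert)

/* Generate an enum of operation types... */
enum my_op_type {
#define x(n)	OP_##n,
	MY_OPS()
#undef x
};

/* ...the resume callbacks... */
#define x(n)	static int resume_##n(void) { printf("resuming %s\n", #n); return 0; }
MY_OPS()
#undef x

/* ...and the type -> callback dispatch table: */
static const struct { enum my_op_type type; int (*resume)(void); } op_fns[] = {
#define x(n)	{ .type = OP_##n, .resume = resume_##n },
	MY_OPS()
#undef x
};

static int resume_op(enum my_op_type type)
{
	for (size_t i = 0; i < sizeof(op_fns) / sizeof(op_fns[0]); i++)
		if (op_fns[i].type == type)
			return op_fns[i].resume();
	return -1;
}

int main(void)
{
	return resume_op(OP_finsert);
}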
-
-static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags),
- trans, logged_op_but_clean,
- "filesystem marked as clean but have logged op\n%s",
- (bch2_bkey_val_to_text(&buf, c, k),
- buf.buf));
-
- struct bkey_buf sk;
- bch2_bkey_buf_init(&sk);
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type);
- if (fn)
- fn->resume(trans, sk.k);
-
- ret = bch2_logged_op_finish(trans, sk.k);
-
- bch2_bkey_buf_exit(&sk, c);
-fsck_err:
- printbuf_exit(&buf);
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
-int bch2_resume_logged_ops(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter,
- BTREE_ID_logged_ops,
- POS(LOGGED_OPS_INUM_logged_ops, 0),
- POS(LOGGED_OPS_INUM_logged_ops, U64_MAX),
- BTREE_ITER_prefetch, k,
- resume_logged_op(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
-{
- struct btree_iter iter;
- int ret = bch2_bkey_get_empty_slot(trans, &iter,
- BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX));
- if (ret)
- return ret;
-
- k->k.p = iter.pos;
-
- ret = bch2_trans_update(trans, &iter, k, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
-{
- return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_logged_op_start(trans, k));
-}
-
-int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
-{
- int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
- /*
- * This needs to be a fatal error because we've left an unfinished
- * operation in the logged ops btree.
- *
- * We should only ever see an error here if the filesystem has already
- * been shut down, but make sure of that here:
- */
- if (ret) {
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
- bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
- buf.buf, bch2_err_str(ret));
- printbuf_exit(&buf);
- }
-
- return ret;
-}
diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h
deleted file mode 100644
index 30ae9ef737dd..000000000000
--- a/fs/bcachefs/logged_ops.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LOGGED_OPS_H
-#define _BCACHEFS_LOGGED_OPS_H
-
-#include "bkey.h"
-
-#define BCH_LOGGED_OPS() \
- x(truncate) \
- x(finsert)
-
-static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
-{
- return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
-}
-
-int bch2_resume_logged_ops(struct bch_fs *);
-int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
-int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
-
-#endif /* _BCACHEFS_LOGGED_OPS_H */
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
deleted file mode 100644
index cfb67c95d4c8..000000000000
--- a/fs/bcachefs/logged_ops_format.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
-#define _BCACHEFS_LOGGED_OPS_FORMAT_H
-
-enum logged_ops_inums {
- LOGGED_OPS_INUM_logged_ops,
- LOGGED_OPS_INUM_inode_cursors,
-};
-
-struct bch_logged_op_truncate {
- struct bch_val v;
- __le32 subvol;
- __le32 pad;
- __le64 inum;
- __le64 new_i_size;
-};
-
-enum logged_op_finsert_state {
- LOGGED_OP_FINSERT_start,
- LOGGED_OP_FINSERT_shift_extents,
- LOGGED_OP_FINSERT_finish,
-};
-
-struct bch_logged_op_finsert {
- struct bch_val v;
- __u8 state;
- __u8 pad[3];
- __le32 subvol;
- __le64 inum;
- __le64 dst_offset;
- __le64 src_offset;
- __le64 pos;
-};
-
-#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
deleted file mode 100644
index 57b5b3263b08..000000000000
--- a/fs/bcachefs/lru.c
+++ /dev/null
@@ -1,223 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "bkey_buf.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "ec.h"
-#include "error.h"
-#include "lru.h"
-#include "recovery.h"
-
-/* KEY_TYPE_lru is obsolete: */
-int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(!lru_pos_time(k.k->p),
- c, lru_entry_at_time_0,
- "lru entry at time=0");
-fsck_err:
- return ret;
-}
-
-void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
-
- prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
-}
-
-void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
-{
- prt_printf(out, "%llu:%llu -> %llu:%llu",
- lru_pos_id(lru),
- lru_pos_time(lru),
- u64_to_bucket(lru.offset).inode,
- u64_to_bucket(lru.offset).offset);
-}
-
-static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
- u64 dev_bucket, u64 time, bool set)
-{
- return time
- ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
- lru_pos(lru_id, dev_bucket, time), set)
- : 0;
-}
-
-int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
-{
- return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
-}
-
-int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
-{
- return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
-}
-
-int __bch2_lru_change(struct btree_trans *trans,
- u16 lru_id, u64 dev_bucket,
- u64 old_time, u64 new_time)
-{
- if (old_time == new_time)
- return 0;
-
- return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
- bch2_lru_set(trans, lru_id, dev_bucket, new_time);
-}
-
-static const char * const bch2_lru_types[] = {
-#define x(n) #n,
- BCH_LRU_TYPES()
-#undef x
- NULL
-};
-
-int bch2_lru_check_set(struct btree_trans *trans,
- u16 lru_id,
- u64 dev_bucket,
- u64 time,
- struct bkey_s_c referring_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter lru_iter;
- struct bkey_s_c lru_k =
- bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
- lru_pos(lru_id, dev_bucket, time), 0);
- int ret = bkey_err(lru_k);
- if (ret)
- return ret;
-
- if (lru_k.k->type != KEY_TYPE_set) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed);
- if (ret)
- goto err;
-
- if (fsck_err(trans, alloc_key_to_missing_lru_entry,
- "missing %s lru entry\n%s",
- bch2_lru_types[lru_type(lru_k)],
- (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
- ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
- if (ret)
- goto err;
- }
- }
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &lru_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
-{
- enum bch_lru_type type = lru_type(lru_k);
-
- switch (type) {
- case BCH_LRU_read:
- case BCH_LRU_fragmentation:
- return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset));
- case BCH_LRU_stripes:
- return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset));
- default:
- BUG();
- }
-}
-
-static u64 bkey_lru_type_idx(struct bch_fs *c,
- enum bch_lru_type type,
- struct bkey_s_c k)
-{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
-
- switch (type) {
- case BCH_LRU_read:
- a = bch2_alloc_to_v4(k, &a_convert);
- return alloc_lru_idx_read(*a);
- case BCH_LRU_fragmentation: {
- a = bch2_alloc_to_v4(k, &a_convert);
-
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
- return ca
- ? alloc_lru_idx_fragmentation(*a, ca)
- : 0;
- }
- case BCH_LRU_stripes:
- return k.k->type == KEY_TYPE_stripe
- ? stripe_lru_pos(bkey_s_c_to_stripe(k).v)
- : 0;
- default:
- BUG();
- }
-}
-
-static int bch2_check_lru_key(struct btree_trans *trans,
- struct btree_iter *lru_iter,
- struct bkey_s_c lru_k,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- struct bbpos bp = lru_pos_to_bp(lru_k);
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0);
- int ret = bkey_err(k);
- if (ret)
- goto err;
-
- enum bch_lru_type type = lru_type(lru_k);
- u64 idx = bkey_lru_type_idx(c, type, k);
-
- if (lru_pos_time(lru_k.k->p) != idx) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
- if (ret)
- goto err;
-
- if (fsck_err(trans, lru_entry_bad,
- "incorrect lru entry: lru %s time %llu\n"
- "%s\n"
- "for %s",
- bch2_lru_types[type],
- lru_pos_time(lru_k.k->p),
- (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
- (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
- }
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
-int bch2_check_lrus(struct bch_fs *c)
-{
- struct bkey_buf last_flushed;
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_lru_key(trans, &iter, k, &last_flushed)));
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch_err_fn(c, ret);
- return ret;
-
-}
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
deleted file mode 100644
index 8abd0aa2083a..000000000000
--- a/fs/bcachefs/lru.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LRU_H
-#define _BCACHEFS_LRU_H
-
-static inline u64 lru_pos_id(struct bpos pos)
-{
- return pos.inode >> LRU_TIME_BITS;
-}
-
-static inline u64 lru_pos_time(struct bpos pos)
-{
- return pos.inode & ~(~0ULL << LRU_TIME_BITS);
-}
-
-static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
-{
- struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket);
-
- EBUG_ON(time > LRU_TIME_MAX);
- EBUG_ON(lru_pos_id(pos) != lru_id);
- EBUG_ON(lru_pos_time(pos) != time);
- EBUG_ON(pos.offset != dev_bucket);
-
- return pos;
-}
-
-static inline enum bch_lru_type lru_type(struct bkey_s_c l)
-{
- u16 lru_id = l.k->p.inode >> 48;
-
- switch (lru_id) {
- case BCH_LRU_BUCKET_FRAGMENTATION:
- return BCH_LRU_fragmentation;
- case BCH_LRU_STRIPE_FRAGMENTATION:
- return BCH_LRU_stripes;
- default:
- return BCH_LRU_read;
- }
-}
-
-int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
-void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
-
-#define bch2_bkey_ops_lru ((struct bkey_ops) { \
- .key_validate = bch2_lru_validate, \
- .val_to_text = bch2_lru_to_text, \
- .min_val_size = 8, \
-})
-
-int bch2_lru_del(struct btree_trans *, u16, u64, u64);
-int bch2_lru_set(struct btree_trans *, u16, u64, u64);
-int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
-
-static inline int bch2_lru_change(struct btree_trans *trans,
- u16 lru_id, u64 dev_bucket,
- u64 old_time, u64 new_time)
-{
- return old_time != new_time
- ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time)
- : 0;
-}
-
-struct bkey_buf;
-int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);
-
-int bch2_check_lrus(struct bch_fs *);
-
-#endif /* _BCACHEFS_LRU_H */
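
The LRU btree encodes three values into one key position: lru_pos() above packs a 16-bit LRU id and a 48-bit time (LRU_TIME_BITS, defined in lru_format.h below) into pos.inode, with the encoded device:bucket in pos.offset. A standalone sketch of just that bit-packing, for illustration only; struct pos here is a plain stand-in for the kernel's struct bpos:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#define LRU_TIME_BITS	48

/* pos.inode: [ 16-bit lru_id | 48-bit time ], pos.offset: encoded dev:bucket */
struct pos { uint64_t inode, offset; };

static struct pos lru_pos(uint16_t lru_id, uint64_t dev_bucket, uint64_t time)
{
        return (struct pos) {
                .inode  = ((uint64_t) lru_id << LRU_TIME_BITS) | time,
                .offset = dev_bucket,
        };
}

static uint64_t lru_pos_id(struct pos p)   { return p.inode >> LRU_TIME_BITS; }
static uint64_t lru_pos_time(struct pos p) { return p.inode & ~(~0ULL << LRU_TIME_BITS); }

int main(void)
{
        struct pos p = lru_pos(3, 0x1234, 1000000);

        printf("id=%" PRIu64 " time=%" PRIu64 " bucket=%" PRIx64 "\n",
               lru_pos_id(p), lru_pos_time(p), p.offset);
        return 0;
}
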
diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
deleted file mode 100644
index b7392ad8e41f..000000000000
--- a/fs/bcachefs/lru_format.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_LRU_FORMAT_H
-#define _BCACHEFS_LRU_FORMAT_H
-
-struct bch_lru {
- struct bch_val v;
- __le64 idx;
-} __packed __aligned(8);
-
-#define BCH_LRU_TYPES() \
- x(read) \
- x(fragmentation) \
- x(stripes)
-
-enum bch_lru_type {
-#define x(n) BCH_LRU_##n,
- BCH_LRU_TYPES()
-#undef x
-};
-
-#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1)
-#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2)
-
-#define LRU_TIME_BITS 48
-#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
-
-#endif /* _BCACHEFS_LRU_FORMAT_H */
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
deleted file mode 100644
index 0ea9f30803a2..000000000000
--- a/fs/bcachefs/mean_and_variance.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions for incremental mean and variance.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * Copyright © 2022 Daniel B. Hill
- *
- * Author: Daniel B. Hill <daniel@gluo.nz>
- *
- * Description:
- *
- * This includes some incremental algorithms for mean and variance calculation.
- *
- * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
- *
- * Create a struct; for the weighted variant, pass the weight (weight = 2^k) to the update and get functions.
- *
- * Use mean_and_variance[_weighted]_update() on the struct to update its state.
- *
- * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance; some computation
- * is deferred to these functions for performance reasons.
- *
- * see mean_and_variance_test.c for examples of usage.
- *
- * DO NOT access the mean and variance fields of the weighted variants directly.
- * DO NOT change the weight after calling update.
- */
-
-#include <linux/bug.h>
-#include <linux/compiler.h>
-#include <linux/export.h>
-#include <linux/limits.h>
-#include <linux/math.h>
-#include <linux/math64.h>
-#include <linux/module.h>
-
-#include "mean_and_variance.h"
-
-u128_u u128_div(u128_u n, u64 d)
-{
- u128_u r;
- u64 rem;
- u64 hi = u128_hi(n);
- u64 lo = u128_lo(n);
- u64 h = hi & ((u64) U32_MAX << 32);
- u64 l = (hi & (u64) U32_MAX) << 32;
-
- r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
- r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
- r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
- return r;
-}
-EXPORT_SYMBOL_GPL(u128_div);
-
-/**
- * mean_and_variance_get_mean() - get mean from @s
- * @s: mean and variance number of samples and their sums
- */
-s64 mean_and_variance_get_mean(struct mean_and_variance s)
-{
- return s.n ? div64_u64(s.sum, s.n) : 0;
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
-
-/**
- * mean_and_variance_get_variance() - get variance from @s1
- * @s1: mean and variance number of samples and sums
- *
- * see linked pdf equation 12.
- */
-u64 mean_and_variance_get_variance(struct mean_and_variance s1)
-{
- if (s1.n) {
- u128_u s2 = u128_div(s1.sum_squares, s1.n);
- u64 s3 = abs(mean_and_variance_get_mean(s1));
-
- return u128_lo(u128_sub(s2, u128_square(s3)));
- } else {
- return 0;
- }
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
-
-/**
- * mean_and_variance_get_stddev() - get standard deviation from @s
- * @s: mean and variance number of samples and their sums
- */
-u32 mean_and_variance_get_stddev(struct mean_and_variance s)
-{
- return int_sqrt64(mean_and_variance_get_variance(s));
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
-
-/**
- * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
- * @s: mean and variance number of samples and their sums
- * @x: new value to include in the &mean_and_variance_weighted
- * @initted: caller must track whether this is the first use or not
- * @weight: ewma weight
- *
- * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
- * values are stored bitshifted for performance and added precision.
- */
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
- s64 x, bool initted, u8 weight)
-{
- // previous weighted variance.
- u8 w = weight;
- u64 var_w0 = s->variance;
- // new value weighted.
- s64 x_w = x << w;
- s64 diff_w = x_w - s->mean;
- s64 diff = fast_divpow2(diff_w, w);
- // new mean weighted.
- s64 u_w1 = s->mean + diff;
-
- if (!initted) {
- s->mean = x_w;
- s->variance = 0;
- } else {
- s->mean = u_w1;
- s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
- }
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
-
-/**
- * mean_and_variance_weighted_get_mean() - get mean from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
- u8 weight)
-{
- return fast_divpow2(s.mean, weight);
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
-
-/**
- * mean_and_variance_weighted_get_variance() - get variance from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
- u8 weight)
-{
- // always positive, so we don't need fast_divpow2
- return s.variance >> weight;
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
-
-/**
- * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
- * @s: mean and variance number of samples and their sums
- * @weight: ewma weight
- */
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
- u8 weight)
-{
- return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
-}
-EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
-
-MODULE_AUTHOR("Daniel B. Hill");
-MODULE_LICENSE("GPL");
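
The unweighted algorithm above keeps only n, the running sum and the running sum of squares, and derives mean = sum/n and variance = sum_squares/n - mean^2 (equation 12 of the linked paper). A userspace sketch of the same idea, for illustration only: plain 64-bit/double arithmetic stands in for the kernel's u128 helpers, and struct mv is a hypothetical stand-in for struct mean_and_variance:

#include <stdio.h>
#include <stdint.h>
#include <math.h>

struct mv {
        uint64_t n;
        int64_t  sum;
        double   sum_squares;	/* the kernel code uses a 128-bit integer here */
};

static void mv_update(struct mv *s, int64_t x)
{
        s->n++;
        s->sum += x;
        s->sum_squares += (double) x * (double) x;
}

static double mv_mean(const struct mv *s)
{
        return s->n ? (double) s->sum / (double) s->n : 0;
}

static double mv_variance(const struct mv *s)
{
        if (!s->n)
                return 0;
        double mean = mv_mean(s);
        return s->sum_squares / (double) s->n - mean * mean;
}

int main(void)
{
        struct mv s = {};
        int64_t samples[] = { 2, 2, 4, 4 };

        for (unsigned i = 0; i < 4; i++)
                mv_update(&s, samples[i]);

        /* matches the basic kunit test below: mean 3, variance 1 */
        printf("mean=%g variance=%g stddev=%g\n",
               mv_mean(&s), mv_variance(&s), sqrt(mv_variance(&s)));
        return 0;
}
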
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
deleted file mode 100644
index 47e4a3c3d26e..000000000000
--- a/fs/bcachefs/mean_and_variance.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef MEAN_AND_VARIANCE_H_
-#define MEAN_AND_VARIANCE_H_
-
-#include <linux/types.h>
-#include <linux/limits.h>
-#include <linux/math.h>
-#include <linux/math64.h>
-
-#define SQRT_U64_MAX 4294967295ULL
-
-/*
- * u128_u: u128 user mode, because not all architectures support a real int128
- * type
- *
- * We don't use this version in userspace, because in userspace we link with
- * Rust and rustc has issues with u128.
- */
-
-#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
-
-typedef struct {
- unsigned __int128 v;
-} __aligned(16) u128_u;
-
-static inline u128_u u64_to_u128(u64 a)
-{
- return (u128_u) { .v = a };
-}
-
-static inline u64 u128_lo(u128_u a)
-{
- return a.v;
-}
-
-static inline u64 u128_hi(u128_u a)
-{
- return a.v >> 64;
-}
-
-static inline u128_u u128_add(u128_u a, u128_u b)
-{
- a.v += b.v;
- return a;
-}
-
-static inline u128_u u128_sub(u128_u a, u128_u b)
-{
- a.v -= b.v;
- return a;
-}
-
-static inline u128_u u128_shl(u128_u a, s8 shift)
-{
- a.v <<= shift;
- return a;
-}
-
-static inline u128_u u128_square(u64 a)
-{
- u128_u b = u64_to_u128(a);
-
- b.v *= b.v;
- return b;
-}
-
-#else
-
-typedef struct {
- u64 hi, lo;
-} __aligned(16) u128_u;
-
-/* conversions */
-
-static inline u128_u u64_to_u128(u64 a)
-{
- return (u128_u) { .lo = a };
-}
-
-static inline u64 u128_lo(u128_u a)
-{
- return a.lo;
-}
-
-static inline u64 u128_hi(u128_u a)
-{
- return a.hi;
-}
-
-/* arithmetic */
-
-static inline u128_u u128_add(u128_u a, u128_u b)
-{
- u128_u c;
-
- c.lo = a.lo + b.lo;
- c.hi = a.hi + b.hi + (c.lo < a.lo);
- return c;
-}
-
-static inline u128_u u128_sub(u128_u a, u128_u b)
-{
- u128_u c;
-
- c.lo = a.lo - b.lo;
- c.hi = a.hi - b.hi - (c.lo > a.lo);
- return c;
-}
-
-static inline u128_u u128_shl(u128_u i, s8 shift)
-{
- u128_u r;
-
- r.lo = i.lo << (shift & 63);
- if (shift < 64)
- r.hi = (i.hi << (shift & 63)) | (i.lo >> (-shift & 63));
- else {
- r.hi = i.lo << (-shift & 63);
- r.lo = 0;
- }
- return r;
-}
-
-static inline u128_u u128_square(u64 i)
-{
- u128_u r;
- u64 h = i >> 32, l = i & U32_MAX;
-
- r = u128_shl(u64_to_u128(h*h), 64);
- r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
- r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
- r = u128_add(r, u64_to_u128(l*l));
- return r;
-}
-
-#endif
-
-static inline u128_u u64s_to_u128(u64 hi, u64 lo)
-{
- u128_u c = u64_to_u128(hi);
-
- c = u128_shl(c, 64);
- c = u128_add(c, u64_to_u128(lo));
- return c;
-}
-
-u128_u u128_div(u128_u n, u64 d);
-
-struct mean_and_variance {
- s64 n;
- s64 sum;
- u128_u sum_squares;
-};
-
-/* exponentially weighted variant */
-struct mean_and_variance_weighted {
- s64 mean;
- u64 variance;
-};
-
-/**
- * fast_divpow2() - fast approximation for n / (1 << d)
- * @n: numerator
- * @d: the power of 2 denominator.
- *
- * note: this rounds towards 0.
- */
-static inline s64 fast_divpow2(s64 n, u8 d)
-{
- return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
-}
-
-/**
- * mean_and_variance_update() - update a mean_and_variance struct @s with a
- * new sample @v
- * @s: the mean_and_variance to update.
- * @v: the new sample.
- *
- * see linked pdf equation 12.
- */
-static inline void
-mean_and_variance_update(struct mean_and_variance *s, s64 v)
-{
- s->n++;
- s->sum += v;
- s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v)));
-}
-
-s64 mean_and_variance_get_mean(struct mean_and_variance s);
-u64 mean_and_variance_get_variance(struct mean_and_variance s1);
-u32 mean_and_variance_get_stddev(struct mean_and_variance s);
-
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
- s64 v, bool initted, u8 weight);
-
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
- u8 weight);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
- u8 weight);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
- u8 weight);
-
-#endif // MEAN_AND_VARIANCE_H_
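
fast_divpow2() above documents that it rounds towards 0: an arithmetic right shift alone rounds towards negative infinity, so a bias of (1 << d) - 1 is added to negative numerators to match ordinary C division by 1 << d. A standalone check of that property, for illustration only (like the kernel code, it assumes arithmetic right shift of negative values):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* Same definition as in the header: round-toward-zero division by 2^d. */
static int64_t fast_divpow2(int64_t n, uint8_t d)
{
        return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
}

int main(void)
{
        for (int64_t n = -100; n <= 100; n++)
                for (uint8_t d = 0; d < 8; d++)
                        /* C integer division already truncates toward zero */
                        assert(fast_divpow2(n, d) == n / (1 << d));

        /* plain shift would give -2; fast_divpow2 gives -1, like -7 / 4 */
        printf("-7 >> 2 = %lld, fast_divpow2(-7, 2) = %lld\n",
               (long long)(-7 >> 2), (long long)fast_divpow2(-7, 2));
        return 0;
}
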
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
deleted file mode 100644
index e9d9c0212e44..000000000000
--- a/fs/bcachefs/mean_and_variance_test.c
+++ /dev/null
@@ -1,221 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <kunit/test.h>
-
-#include "mean_and_variance.h"
-
-#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX)
-
-static void mean_and_variance_basic_test(struct kunit *test)
-{
- struct mean_and_variance s = {};
-
- mean_and_variance_update(&s, 2);
- mean_and_variance_update(&s, 2);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0);
- KUNIT_EXPECT_EQ(test, s.n, 2);
-
- mean_and_variance_update(&s, 4);
- mean_and_variance_update(&s, 4);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1);
- KUNIT_EXPECT_EQ(test, s.n, 4);
-}
-
-/*
- * Test values computed using a spreadsheet from the pseudocode at the bottom:
- * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
- */
-
-static void mean_and_variance_weighted_test(struct kunit *test)
-{
- struct mean_and_variance_weighted s = { };
-
- mean_and_variance_weighted_update(&s, 10, false, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
-
- mean_and_variance_weighted_update(&s, 20, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
-
- mean_and_variance_weighted_update(&s, 30, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
-
- s = (struct mean_and_variance_weighted) { };
-
- mean_and_variance_weighted_update(&s, -10, false, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
-
- mean_and_variance_weighted_update(&s, -20, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
-
- mean_and_variance_weighted_update(&s, -30, true, 2);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
-}
-
-static void mean_and_variance_weighted_advanced_test(struct kunit *test)
-{
- struct mean_and_variance_weighted s = { };
- bool initted = false;
- s64 i;
-
- for (i = 10; i <= 100; i += 10) {
- mean_and_variance_weighted_update(&s, i, initted, 8);
- initted = true;
- }
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
-
- s = (struct mean_and_variance_weighted) { };
- initted = false;
-
- for (i = -10; i >= -100; i -= 10) {
- mean_and_variance_weighted_update(&s, i, initted, 8);
- initted = true;
- }
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
-}
-
-static void do_mean_and_variance_test(struct kunit *test,
- s64 initial_value,
- s64 initial_n,
- s64 n,
- unsigned weight,
- s64 *data,
- s64 *mean,
- s64 *stddev,
- s64 *weighted_mean,
- s64 *weighted_stddev)
-{
- struct mean_and_variance mv = {};
- struct mean_and_variance_weighted vw = { };
-
- for (unsigned i = 0; i < initial_n; i++) {
- mean_and_variance_update(&mv, initial_value);
- mean_and_variance_weighted_update(&vw, initial_value, false, weight);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0);
- }
-
- for (unsigned i = 0; i < n; i++) {
- mean_and_variance_update(&mv, data[i]);
- mean_and_variance_weighted_update(&vw, data[i], true, weight);
-
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]);
- KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]);
- }
-
- KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
-}
-
-/* Test behaviour with a single outlier, then back to steady state: */
-static void mean_and_variance_test_1(struct kunit *test)
-{
- s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
- s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 };
- s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 };
- s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
- s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
-/* Test behaviour where we switch from one steady state to another: */
-static void mean_and_variance_test_2(struct kunit *test)
-{
- s64 d[] = { 100, 100, 100, 100, 100 };
- s64 mean[] = { 22, 32, 40, 46, 50 };
- s64 stddev[] = { 32, 39, 42, 44, 45 };
- s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
- s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
-
- do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
- d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
-static void mean_and_variance_fast_divpow2(struct kunit *test)
-{
- s64 i;
- u8 d;
-
- for (i = 0; i < 100; i++) {
- d = 0;
- KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d));
- KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d));
- for (d = 1; d < 32; d++) {
- KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)),
- div_u64(i, 1 << d), "%lld %u", i, d);
- KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)),
- div_u64(i, 1 << d), "%lld %u", -i, d);
- }
- }
-}
-
-static void mean_and_variance_u128_basic_test(struct kunit *test)
-{
- u128_u a = u64s_to_u128(0, U64_MAX);
- u128_u a1 = u64s_to_u128(0, 1);
- u128_u b = u64s_to_u128(1, 0);
- u128_u c = u64s_to_u128(0, 1LLU << 63);
- u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0);
- KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0);
-
- KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX);
- KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1);
-
- KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX);
-
- KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1);
- KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31);
-}
-
-static struct kunit_case mean_and_variance_test_cases[] = {
- KUNIT_CASE(mean_and_variance_fast_divpow2),
- KUNIT_CASE(mean_and_variance_u128_basic_test),
- KUNIT_CASE(mean_and_variance_basic_test),
- KUNIT_CASE(mean_and_variance_weighted_test),
- KUNIT_CASE(mean_and_variance_weighted_advanced_test),
- KUNIT_CASE(mean_and_variance_test_1),
- KUNIT_CASE(mean_and_variance_test_2),
- {}
-};
-
-static struct kunit_suite mean_and_variance_test_suite = {
- .name = "mean and variance tests",
- .test_cases = mean_and_variance_test_cases
-};
-
-kunit_test_suite(mean_and_variance_test_suite);
-
-MODULE_AUTHOR("Daniel B. Hill");
-MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests");
-MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
deleted file mode 100644
index f296cce95338..000000000000
--- a/fs/bcachefs/migrate.c
+++ /dev/null
@@ -1,277 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Code for moving data off a device.
- */
-
-#include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "ec.h"
-#include "errcode.h"
-#include "extents.h"
-#include "io_write.h"
-#include "journal.h"
-#include "keylist.h"
-#include "migrate.h"
-#include "move.h"
-#include "progress.h"
-#include "replicas.h"
-#include "super-io.h"
-
-static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
- unsigned dev_idx, unsigned flags, bool metadata)
-{
- unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
- unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
- unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
- unsigned nr_good;
-
- bch2_bkey_drop_device(k, dev_idx);
-
- nr_good = bch2_bkey_durability(c, k.s_c);
- if ((!nr_good && !(flags & lost)) ||
- (nr_good < replicas && !(flags & degraded)))
- return bch_err_throw(c, remove_would_lose_data);
-
- return 0;
-}
-
-static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter,
- struct btree *b, unsigned dev_idx, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_buf k;
-
- bch2_bkey_buf_init(&k);
- bch2_bkey_buf_copy(&k, c, &b->key);
-
- int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?:
- bch2_btree_node_update_key(trans, iter, b, k.k, 0, false);
-
- bch_err_fn(c, ret);
- bch2_bkey_buf_exit(&k, c);
- return ret;
-}
-
-static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- unsigned dev_idx,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i *n;
- int ret;
-
- if (!bch2_bkey_has_device_c(k, dev_idx))
- return 0;
-
- n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
- if (ret)
- return ret;
-
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_error key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, bkey_i_to_s(n));
-
- /*
- * Since we're not inserting through an extent iterator
- * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
- * we aren't using the extent overwrite path to delete, we're
- * just using the normal key deletion path:
- */
- if (bkey_deleted(&n->k))
- n->k.size = 0;
- return 0;
-}
-
-static int bch2_dev_btree_drop_key(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- unsigned dev_idx,
- struct bkey_buf *last_flushed,
- unsigned flags)
-{
- struct btree_iter iter;
- struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed);
- int ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret;
-
- ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_dev_usrdata_drop(struct bch_fs *c,
- struct progress_indicator_state *progress,
- unsigned dev_idx, unsigned flags)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- enum btree_id id;
- int ret = 0;
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- if (!btree_type_has_ptrs(id))
- continue;
-
- ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- bch2_progress_update_iter(trans, progress, &iter, "dropping user data");
- bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
- }));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
-
- return ret;
-}
-
-static int bch2_dev_metadata_drop(struct bch_fs *c,
- struct progress_indicator_state *progress,
- unsigned dev_idx, unsigned flags)
-{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct closure cl;
- struct btree *b;
- struct bkey_buf k;
- unsigned id;
- int ret;
-
- /* don't handle this yet: */
- if (flags & BCH_FORCE_IF_METADATA_LOST)
- return bch_err_throw(c, remove_with_metadata_missing_unimplemented);
-
- trans = bch2_trans_get(c);
- bch2_bkey_buf_init(&k);
- closure_init_stack(&cl);
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
- BTREE_ITER_prefetch);
-retry:
- ret = 0;
- while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(trans, &iter)) &&
- !(ret = PTR_ERR_OR_ZERO(b))) {
- bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
-
- if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
- goto next;
-
- ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- ret = 0;
- continue;
- }
-
- if (ret)
- break;
-next:
- bch2_btree_iter_next_node(trans, &iter);
- }
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- goto err;
- }
-
- bch2_btree_interior_updates_flush(c);
- ret = 0;
-err:
- bch2_bkey_buf_exit(&k, c);
- bch2_trans_put(trans);
-
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
- return ret;
-}
-
-static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx,
- struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed,
- unsigned flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent,
- last_flushed);
- int ret = bkey_err(k);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- return 0;
- if (ret)
- return ret;
-
- if (!k.k || !bch2_bkey_has_device_c(k, dev_idx))
- goto out;
-
- /*
- * XXX: pass flags arg to invalidate_stripe_to_dev and handle it
- * properly
- */
-
- if (bkey_is_btree_ptr(k.k))
- ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags);
- else if (k.k->type == KEY_TYPE_stripe)
- ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags);
- else
- ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
- struct btree_trans *trans = bch2_trans_get(c);
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
- for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
- POS(dev_idx, 0),
- POS(dev_idx, U64_MAX), 0, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- if (k.k->type != KEY_TYPE_backpointer)
- continue;
-
- data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
- &last_flushed, flags);
-
- }));
-
- bch2_bkey_buf_exit(&last_flushed, trans->c);
- bch2_trans_put(trans);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags)
-{
- struct progress_indicator_state progress;
- bch2_progress_init(&progress, c,
- BIT_ULL(BTREE_ID_extents)|
- BIT_ULL(BTREE_ID_reflink));
-
- return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?:
- bch2_dev_metadata_drop(c, &progress, dev_idx, flags);
-}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
deleted file mode 100644
index 30018140711b..000000000000
--- a/fs/bcachefs/migrate.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MIGRATE_H
-#define _BCACHEFS_MIGRATE_H
-
-int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned);
-int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned);
-
-#endif /* _BCACHEFS_MIGRATE_H */
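
The policy check in drop_dev_ptrs() above refuses to drop a device's pointer unless the caller passed the matching force flag: the *_LOST flag when no durable replica would remain at all, the *_DEGRADED flag when fewer than the configured number would remain. A small standalone sketch of just that decision, for illustration only; FORCE_IF_LOST/FORCE_IF_DEGRADED and their values are hypothetical stand-ins for the real BCH_FORCE_IF_* ioctl flags:

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-ins for the BCH_FORCE_IF_* flags. */
#define FORCE_IF_LOST		(1U << 0)
#define FORCE_IF_DEGRADED	(1U << 1)

/*
 * Mirror of the check in drop_dev_ptrs(): @nr_good is the durability left
 * after the device's pointer has been dropped, @replicas the configured goal.
 */
static bool drop_allowed(unsigned nr_good, unsigned replicas, unsigned flags)
{
        if (!nr_good && !(flags & FORCE_IF_LOST))
                return false;		/* would lose the data entirely */
        if (nr_good < replicas && !(flags & FORCE_IF_DEGRADED))
                return false;		/* would leave the data degraded */
        return true;
}

int main(void)
{
        printf("%d\n", drop_allowed(0, 2, 0));			/* 0: refused, data lost */
        printf("%d\n", drop_allowed(1, 2, 0));			/* 0: refused, degraded */
        printf("%d\n", drop_allowed(1, 2, FORCE_IF_DEGRADED));	/* 1: allowed */
        printf("%d\n", drop_allowed(2, 2, 0));			/* 1: allowed, still at goal */
        return 0;
}
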
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
deleted file mode 100644
index eec591e947bd..000000000000
--- a/fs/bcachefs/move.c
+++ /dev/null
@@ -1,1494 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_write_buffer.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "move.h"
-#include "rebalance.h"
-#include "reflink.h"
-#include "replicas.h"
-#include "snapshot.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/ioprio.h>
-#include <linux/kthread.h>
-
-const char * const bch2_data_ops_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_DATA_OPS()
-#undef x
- NULL
-};
-
-struct evacuate_bucket_arg {
- struct bpos bucket;
- int gen;
- struct data_update_opts data_opts;
-};
-
-static bool evacuate_bucket_pred(struct bch_fs *, void *,
- enum btree_id, struct bkey_s_c,
- struct bch_io_opts *,
- struct data_update_opts *);
-
-static noinline void
-trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
- trace_io_move(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- trace_io_move_read(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-static noinline void
-trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts,
- move_pred_fn pred, void *_arg, bool p)
-{
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "%ps: %u", pred, p);
-
- if (pred == evacuate_bucket_pred) {
- struct evacuate_bucket_arg *arg = _arg;
- prt_printf(&buf, " gen=%u", arg->gen);
- }
-
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
- trace_io_move_pred(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-static noinline void
-trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
-{
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "bucket: ");
- bch2_bpos_to_text(&buf, bucket);
- prt_printf(&buf, " gen: %i\n", gen);
-
- trace_io_move_evacuate_bucket(c, buf.buf);
- printbuf_exit(&buf);
-}
-
-struct moving_io {
- struct list_head read_list;
- struct list_head io_list;
- struct move_bucket *b;
- struct closure cl;
- bool read_completed;
-
- unsigned read_sectors;
- unsigned write_sectors;
-
- struct data_update write;
-};
-
-static void move_free(struct moving_io *io)
-{
- struct moving_context *ctxt = io->write.ctxt;
-
- if (io->b)
- atomic_dec(&io->b->count);
-
- mutex_lock(&ctxt->lock);
- list_del(&io->io_list);
- wake_up(&ctxt->wait);
- mutex_unlock(&ctxt->lock);
-
- if (!io->write.data_opts.scrub) {
- bch2_data_update_exit(&io->write);
- } else {
- bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
- kfree(io->write.bvecs);
- }
- kfree(io);
-}
-
-static void move_write_done(struct bch_write_op *op)
-{
- struct moving_io *io = container_of(op, struct moving_io, write.op);
- struct bch_fs *c = op->c;
- struct moving_context *ctxt = io->write.ctxt;
-
- if (op->error) {
- if (trace_io_move_write_fail_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_write_op_to_text(&buf, op);
- trace_io_move_write_fail(c, buf.buf);
- printbuf_exit(&buf);
- }
- this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
-
- ctxt->write_error = true;
- }
-
- atomic_sub(io->write_sectors, &ctxt->write_sectors);
- atomic_dec(&ctxt->write_ios);
- move_free(io);
- closure_put(&ctxt->cl);
-}
-
-static void move_write(struct moving_io *io)
-{
- struct bch_fs *c = io->write.op.c;
- struct moving_context *ctxt = io->write.ctxt;
- struct bch_read_bio *rbio = &io->write.rbio;
-
- if (ctxt->stats) {
- if (rbio->bio.bi_status)
- atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
- &ctxt->stats->sectors_error_uncorrected);
- else if (rbio->saw_error)
- atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
- &ctxt->stats->sectors_error_corrected);
- }
-
- /*
- * If the extent has been bitrotted, we're going to have to give it a
- * new checksum in order to move it - but the poison bit will ensure
- * that userspace still gets the appropriate error.
- */
- if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
- (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
- struct nonce nonce = extent_nonce(rbio->version, crc);
-
- rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
- nonce, &rbio->bio);
- rbio->ret = 0;
- }
-
- if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
- move_free(io);
- return;
- }
-
- if (trace_io_move_write_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
- trace_io_move_write(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- closure_get(&io->write.ctxt->cl);
- atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
- atomic_inc(&io->write.ctxt->write_ios);
-
- bch2_data_update_read_done(&io->write);
-}
-
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
-{
- struct moving_io *io =
- list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
-
- return io && io->read_completed ? io : NULL;
-}
-
-static void move_read_endio(struct bio *bio)
-{
- struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
- struct moving_context *ctxt = io->write.ctxt;
-
- atomic_sub(io->read_sectors, &ctxt->read_sectors);
- atomic_dec(&ctxt->read_ios);
- io->read_completed = true;
-
- wake_up(&ctxt->wait);
- closure_put(&ctxt->cl);
-}
-
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
-{
- struct moving_io *io;
-
- while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
- bch2_trans_unlock_long(ctxt->trans);
- list_del(&io->read_list);
- move_write(io);
- }
-}
-
-void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
-{
- unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
-
- move_ctxt_wait_event(ctxt,
- !atomic_read(&ctxt->write_sectors) ||
- atomic_read(&ctxt->write_sectors) != sectors_pending);
-}
-
-void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
-{
- move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
- bch2_trans_unlock_long(ctxt->trans);
- closure_sync(&ctxt->cl);
-}
-
-void bch2_moving_ctxt_exit(struct moving_context *ctxt)
-{
- struct bch_fs *c = ctxt->trans->c;
-
- bch2_moving_ctxt_flush_all(ctxt);
-
- EBUG_ON(atomic_read(&ctxt->write_sectors));
- EBUG_ON(atomic_read(&ctxt->write_ios));
- EBUG_ON(atomic_read(&ctxt->read_sectors));
- EBUG_ON(atomic_read(&ctxt->read_ios));
-
- mutex_lock(&c->moving_context_lock);
- list_del(&ctxt->list);
- mutex_unlock(&c->moving_context_lock);
-
- /*
- * Generally, releasing a transaction within a transaction restart means
- * an unhandled transaction restart: but this can happen legitimately
- * within the move code, e.g. when bch2_move_ratelimit() tells us to
- * exit before we've retried
- */
- bch2_trans_begin(ctxt->trans);
- bch2_trans_put(ctxt->trans);
- memset(ctxt, 0, sizeof(*ctxt));
-}
-
-void bch2_moving_ctxt_init(struct moving_context *ctxt,
- struct bch_fs *c,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc)
-{
- memset(ctxt, 0, sizeof(*ctxt));
-
- ctxt->trans = bch2_trans_get(c);
- ctxt->fn = (void *) _RET_IP_;
- ctxt->rate = rate;
- ctxt->stats = stats;
- ctxt->wp = wp;
- ctxt->wait_on_copygc = wait_on_copygc;
-
- closure_init_stack(&ctxt->cl);
-
- mutex_init(&ctxt->lock);
- INIT_LIST_HEAD(&ctxt->reads);
- INIT_LIST_HEAD(&ctxt->ios);
- init_waitqueue_head(&ctxt->wait);
-
- mutex_lock(&c->moving_context_lock);
- list_add(&ctxt->list, &c->moving_context_list);
- mutex_unlock(&c->moving_context_lock);
-}
-
-void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
-{
- trace_move_data(c, stats);
-}
-
-void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
-{
- memset(stats, 0, sizeof(*stats));
- stats->data_type = BCH_DATA_user;
- scnprintf(stats->name, sizeof(stats->name), "%s", name);
-}
-
-int bch2_move_extent(struct moving_context *ctxt,
- struct move_bucket *bucket_in_flight,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_io_opts io_opts,
- struct data_update_opts data_opts)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- int ret = -ENOMEM;
-
- if (trace_io_move_enabled())
- trace_io_move2(c, k, &io_opts, &data_opts);
- this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
-
- if (ctxt->stats)
- ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
-
- bch2_data_update_opts_normalize(k, &data_opts);
-
- if (!data_opts.rewrite_ptrs &&
- !data_opts.extra_replicas &&
- !data_opts.scrub) {
- if (data_opts.kill_ptrs)
- return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
- return 0;
- }
-
- struct moving_io *io = allocate_dropping_locks(trans, ret,
- kzalloc(sizeof(struct moving_io), _gfp));
- if (!io)
- goto err;
-
- if (ret)
- goto err_free;
-
- INIT_LIST_HEAD(&io->io_list);
- io->write.ctxt = ctxt;
- io->read_sectors = k.k->size;
- io->write_sectors = k.k->size;
-
- if (!data_opts.scrub) {
- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
- &io_opts, data_opts, iter->btree_id, k);
- if (ret)
- goto err_free;
-
- io->write.op.end_io = move_write_done;
- } else {
- bch2_bkey_buf_init(&io->write.k);
- bch2_bkey_buf_reassemble(&io->write.k, c, k);
-
- io->write.op.c = c;
- io->write.data_opts = data_opts;
-
- bch2_trans_unlock(trans);
-
- ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
- if (ret)
- goto err_free;
- }
-
- io->write.rbio.bio.bi_end_io = move_read_endio;
- io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
-
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
-
- if (ctxt->stats) {
- atomic64_inc(&ctxt->stats->keys_moved);
- atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
- }
-
- if (bucket_in_flight) {
- io->b = bucket_in_flight;
- atomic_inc(&io->b->count);
- }
-
- if (trace_io_move_read_enabled())
- trace_io_move_read2(c, k);
-
- mutex_lock(&ctxt->lock);
- atomic_add(io->read_sectors, &ctxt->read_sectors);
- atomic_inc(&ctxt->read_ios);
-
- list_add_tail(&io->read_list, &ctxt->reads);
- list_add_tail(&io->io_list, &ctxt->ios);
- mutex_unlock(&ctxt->lock);
-
- /*
- * dropped by move_read_endio() - guards against use after free of
- * ctxt when doing wakeup
- */
- closure_get(&ctxt->cl);
- __bch2_read_extent(trans, &io->write.rbio,
- io->write.rbio.bio.bi_iter,
- bkey_start_pos(k.k),
- iter->btree_id, k, 0,
- NULL,
- BCH_READ_last_fragment,
- data_opts.scrub ? data_opts.read_dev : -1);
- return 0;
-err_free:
- kfree(io);
-err:
- if (bch2_err_matches(ret, EROFS) ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
-
- count_event(c, io_move_start_fail);
-
- if (trace_io_move_start_fail_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, ": ");
- prt_str(&buf, bch2_err_str(ret));
- trace_io_move_start_fail(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- if (bch2_err_matches(ret, BCH_ERR_data_update_done))
- return 0;
- return ret;
-}
-
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
- struct per_snapshot_io_opts *io_opts,
- struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
- struct btree_iter *extent_iter,
- struct bkey_s_c extent_k)
-{
- struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
- struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
- int ret = 0;
-
- if (extent_iter->min_depth)
- return opts_ret;
-
- if (extent_k.k->type == KEY_TYPE_reflink_v)
- goto out;
-
- if (io_opts->cur_inum != extent_pos.inode) {
- io_opts->d.nr = 0;
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
- BTREE_ITER_all_snapshots, k, ({
- if (k.k->p.offset != extent_pos.inode)
- break;
-
- if (!bkey_is_inode(k.k))
- continue;
-
- struct bch_inode_unpacked inode;
- _ret3 = bch2_inode_unpack(k, &inode);
- if (_ret3)
- break;
-
- struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
- bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
-
- darray_push(&io_opts->d, e);
- }));
- io_opts->cur_inum = extent_pos.inode;
- }
-
- ret = ret ?: trans_was_restarted(trans, restart_count);
- if (ret)
- return ERR_PTR(ret);
-
- if (extent_k.k->p.snapshot)
- darray_for_each(io_opts->d, i)
- if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
- opts_ret = &i->io_opts;
- break;
- }
-out:
- ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
- if (ret)
- return ERR_PTR(ret);
- return opts_ret;
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct btree_iter *extent_iter,
- struct bkey_s_c extent_k)
-{
- struct bch_fs *c = trans->c;
-
- *io_opts = bch2_opts_to_inode_opts(c->opts);
-
- /* reflink btree? */
- if (!extent_k.k->p.inode)
- goto out;
-
- struct btree_iter inode_iter;
- struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
- SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
- BTREE_ITER_cached);
- int ret = bkey_err(inode_k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
-
- if (!ret && bkey_is_inode(inode_k.k)) {
- struct bch_inode_unpacked inode;
- bch2_inode_unpack(inode_k, &inode);
- bch2_inode_opts_get(io_opts, c, &inode);
- }
- bch2_trans_iter_exit(trans, &inode_iter);
- /* seem to be spinning here? */
-out:
- return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
-}
-
-int bch2_move_ratelimit(struct moving_context *ctxt)
-{
- struct bch_fs *c = ctxt->trans->c;
- bool is_kthread = current->flags & PF_KTHREAD;
- u64 delay;
-
- if (ctxt->wait_on_copygc && c->copygc_running) {
- bch2_moving_ctxt_flush_all(ctxt);
- wait_event_killable(c->copygc_running_wq,
- !c->copygc_running ||
- (is_kthread && kthread_should_stop()));
- }
-
- do {
- delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
-
- if (is_kthread && kthread_should_stop())
- return 1;
-
- if (delay)
- move_ctxt_wait_event_timeout(ctxt,
- freezing(current) ||
- (is_kthread && kthread_should_stop()),
- delay);
-
- if (unlikely(freezing(current))) {
- bch2_moving_ctxt_flush_all(ctxt);
- try_to_freeze();
- }
- } while (delay);
-
- /*
- * XXX: these limits really ought to be per device; SSDs and hard drives
- * will want different limits
- */
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
- atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
- atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
- atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
-
- return 0;
-}
-
-/*
- * Move requires non extents iterators, and there's also no need for it to
- * signal indirect_extent_missing_error:
- */
-static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_reflink_p p)
-{
- if (unlikely(REFLINK_P_ERROR(p.v)))
- return bkey_s_c_null;
-
- struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));
-
- bch2_trans_iter_init(trans, iter,
- BTREE_ID_reflink, reflink_pos,
- BTREE_ITER_not_extents);
-
- struct bkey_s_c k = bch2_btree_iter_peek(trans, iter);
- if (!k.k || bkey_err(k)) {
- bch2_trans_iter_exit(trans, iter);
- return k;
- }
-
- if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) {
- bch2_trans_iter_exit(trans, iter);
- return bkey_s_c_null;
- }
-
- return k;
-}
-
-int bch2_move_data_btree(struct moving_context *ctxt,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- enum btree_id btree_id, unsigned level)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct per_snapshot_io_opts snapshot_io_opts;
- struct bch_io_opts *io_opts;
- struct bkey_buf sk;
- struct btree_iter iter, reflink_iter = {};
- struct bkey_s_c k;
- struct data_update_opts data_opts;
- /*
- * If we're moving a single file, also process reflinked data it points
- * to (this includes propagating changed io_opts from the inode to the
- * extent):
- */
- bool walk_indirect = start.inode == end.inode;
- int ret = 0, ret2;
-
- per_snapshot_io_opts_init(&snapshot_io_opts, c);
- bch2_bkey_buf_init(&sk);
-
- if (ctxt->stats) {
- ctxt->stats->data_type = BCH_DATA_user;
- ctxt->stats->pos = BBPOS(btree_id, start);
- }
-
-retry_root:
- bch2_trans_begin(trans);
-
- if (level == bch2_btree_id_root(c, btree_id)->level + 1) {
- bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1,
- BTREE_ITER_prefetch|
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret)
- goto root_err;
-
- if (b != btree_node_root(c, b)) {
- bch2_trans_iter_exit(trans, &iter);
- goto retry_root;
- }
-
- k = bkey_i_to_s_c(&b->key);
-
- io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
- iter.pos, &iter, k);
- ret = PTR_ERR_OR_ZERO(io_opts);
- if (ret)
- goto root_err;
-
- memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts))
- goto out;
-
-
- if (!data_opts.scrub)
- ret = bch2_btree_node_rewrite_pos(trans, btree_id, level,
- k.k->p, data_opts.target, 0);
- else
- ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
-
-root_err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- bch2_trans_iter_exit(trans, &iter);
- goto retry_root;
- }
-
- goto out;
- }
-
- bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level,
- BTREE_ITER_prefetch|
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
-
- if (ctxt->rate)
- bch2_ratelimit_reset(ctxt->rate);
-
- while (!bch2_move_ratelimit(ctxt)) {
- struct btree_iter *extent_iter = &iter;
-
- bch2_trans_begin(trans);
-
- k = bch2_btree_iter_peek(trans, &iter);
- if (!k.k)
- break;
-
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (bkey_gt(bkey_start_pos(k.k), end))
- break;
-
- if (ctxt->stats)
- ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
-
- if (walk_indirect &&
- k.k->type == KEY_TYPE_reflink_p &&
- REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-
- bch2_trans_iter_exit(trans, &reflink_iter);
- k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (!k.k)
- goto next_nondata;
-
- /*
- * XXX: reflink pointers may point to multiple indirect
- * extents, so don't advance past the entire reflink
- * pointer - need to fixup iter->k
- */
- extent_iter = &reflink_iter;
- }
-
- if (!bkey_extent_is_direct_data(k.k))
- goto next_nondata;
-
- io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
- iter.pos, extent_iter, k);
- ret = PTR_ERR_OR_ZERO(io_opts);
- if (ret)
- continue;
-
- memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts))
- goto next;
-
- /*
- * The iterator gets unlocked by __bch2_read_extent - need to
- * save a copy of @k elsewhere:
- */
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- if (!level)
- ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
- else if (!data_opts.scrub)
- ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level,
- k.k->p, data_opts.target, 0);
- else
- ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
-
- if (ret2) {
- if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
- continue;
-
- if (bch2_err_matches(ret2, ENOMEM)) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- continue;
- }
-
- /* XXX signal failure */
- goto next;
- }
-next:
- if (ctxt->stats)
- atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
-next_nondata:
- if (!bch2_btree_iter_advance(trans, &iter))
- break;
- }
-out:
- bch2_trans_iter_exit(trans, &reflink_iter);
- bch2_trans_iter_exit(trans, &iter);
- bch2_bkey_buf_exit(&sk, c);
- per_snapshot_io_opts_exit(&snapshot_io_opts);
-
- return ret;
-}
-
-int __bch2_move_data(struct moving_context *ctxt,
- struct bbpos start,
- struct bbpos end,
- move_pred_fn pred, void *arg)
-{
- struct bch_fs *c = ctxt->trans->c;
- enum btree_id id;
- int ret = 0;
-
- for (id = start.btree;
- id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
- id++) {
- ctxt->stats->pos = BBPOS(id, POS_MIN);
-
- if (!btree_type_has_ptrs(id) ||
- !bch2_btree_id_root(c, id)->b)
- continue;
-
- ret = bch2_move_data_btree(ctxt,
- id == start.btree ? start.pos : POS_MIN,
- id == end.btree ? end.pos : POS_MAX,
- pred, arg, id, 0);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-int bch2_move_data(struct bch_fs *c,
- struct bbpos start,
- struct bbpos end,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
-{
- struct moving_context ctxt;
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
- bch2_moving_ctxt_exit(&ctxt);
-
- return ret;
-}
-
-static int __bch2_move_data_phys(struct moving_context *ctxt,
- struct move_bucket *bucket_in_flight,
- unsigned dev,
- u64 bucket_start,
- u64 bucket_end,
- unsigned data_types,
- bool copygc,
- move_pred_fn pred, void *arg)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- bool is_kthread = current->flags & PF_KTHREAD;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct btree_iter iter = {}, bp_iter = {};
- struct bkey_buf sk;
- struct bkey_s_c k;
- struct bkey_buf last_flushed;
- u64 check_mismatch_done = bucket_start;
- int ret = 0;
-
- struct bch_dev *ca = bch2_dev_tryget(c, dev);
- if (!ca)
- return 0;
-
- bucket_end = min(bucket_end, ca->mi.nbuckets);
-
- struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
- struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
-
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
- bch2_bkey_buf_init(&sk);
-
- /*
- * We're not run in a context that handles transaction restarts:
- */
- bch2_trans_begin(trans);
-
- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);
-
- ret = bch2_btree_write_buffer_tryflush(trans);
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "flushing btree write buffer");
- if (ret)
- goto err;
-
- while (!(ret = bch2_move_ratelimit(ctxt))) {
- if (is_kthread && kthread_should_stop())
- break;
-
- bch2_trans_begin(trans);
-
- k = bch2_btree_iter_peek(trans, &bp_iter);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
-
- if (!k.k || bkey_gt(k.k->p, bp_end))
- break;
-
- if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
- while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
- bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
- copygc, &last_flushed);
- }
- continue;
- }
-
- if (k.k->type != KEY_TYPE_backpointer)
- goto next;
-
- struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
-
- if (ctxt->stats)
- ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
-
- if (!(data_types & BIT(bp.v->data_type)))
- goto next;
-
- if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
- goto next;
-
- k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
- if (!k.k)
- goto next;
-
- if (!bp.v->level) {
- ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
- if (ret) {
- bch2_trans_iter_exit(trans, &iter);
- continue;
- }
- }
-
- struct data_update_opts data_opts = {};
- bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts);
-
- if (trace_io_move_pred_enabled())
- trace_io_move_pred2(c, k, &io_opts, &data_opts,
- pred, arg, p);
-
- if (!p) {
- bch2_trans_iter_exit(trans, &iter);
- goto next;
- }
-
- if (data_opts.scrub &&
- !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
- bch2_trans_iter_exit(trans, &iter);
- ret = bch_err_throw(c, device_offline);
- break;
- }
-
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- /* move_extent will drop locks */
- unsigned sectors = bp.v->bucket_len;
-
- if (!bp.v->level)
- ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
- else if (!data_opts.scrub)
- ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level,
- k.k->p, data_opts.target, 0);
- else
- ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret == -ENOMEM) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- continue;
- }
- if (ret)
- goto err;
-
- if (ctxt->stats)
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
-next:
- bch2_btree_iter_advance(trans, &bp_iter);
- }
-
- while (check_mismatch_done < bucket_end)
- bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
- copygc, &last_flushed);
-err:
- bch2_trans_iter_exit(trans, &bp_iter);
- bch2_bkey_buf_exit(&sk, c);
- bch2_bkey_buf_exit(&last_flushed, c);
- bch2_dev_put(ca);
- return ret;
-}
-
-int bch2_move_data_phys(struct bch_fs *c,
- unsigned dev,
- u64 start,
- u64 end,
- unsigned data_types,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
-{
- struct moving_context ctxt;
-
- bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- if (ctxt.stats) {
- ctxt.stats->phys = true;
- ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
- }
-
- int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end,
- data_types, false, pred, arg);
- bch2_moving_ctxt_exit(&ctxt);
-
- return ret;
-}
-
-static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
- enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct evacuate_bucket_arg *arg = _arg;
-
- *data_opts = arg->data_opts;
-
- unsigned i = 0;
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- if (ptr->dev == arg->bucket.inode &&
- (arg->gen < 0 || arg->gen == ptr->gen) &&
- !ptr->cached)
- data_opts->rewrite_ptrs |= BIT(i);
- i++;
- }
-
- return data_opts->rewrite_ptrs != 0;
-}
-
-int bch2_evacuate_bucket(struct moving_context *ctxt,
- struct move_bucket *bucket_in_flight,
- struct bpos bucket, int gen,
- struct data_update_opts data_opts)
-{
- struct bch_fs *c = ctxt->trans->c;
- struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
-
- count_event(c, io_move_evacuate_bucket);
- if (trace_io_move_evacuate_bucket_enabled())
- trace_io_move_evacuate_bucket2(c, bucket, gen);
-
- return __bch2_move_data_phys(ctxt, bucket_in_flight,
- bucket.inode,
- bucket.offset,
- bucket.offset + 1,
- ~0,
- true,
- evacuate_bucket_pred, &arg);
-}
-
-typedef bool (*move_btree_pred)(struct bch_fs *, void *,
- struct btree *, struct bch_io_opts *,
- struct data_update_opts *);
-
-static int bch2_move_btree(struct bch_fs *c,
- struct bbpos start,
- struct bbpos end,
- move_btree_pred pred, void *arg,
- struct bch_move_stats *stats)
-{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct moving_context ctxt;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct btree *b;
- enum btree_id btree;
- struct data_update_opts data_opts;
- int ret = 0;
-
- bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
- writepoint_ptr(&c->btree_write_point),
- true);
- trans = ctxt.trans;
-
- stats->data_type = BCH_DATA_btree;
-
- for (btree = start.btree;
- btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
- btree ++) {
- stats->pos = BBPOS(btree, POS_MIN);
-
- if (!bch2_btree_id_root(c, btree)->b)
- continue;
-
- bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
- BTREE_ITER_prefetch);
-retry:
- ret = 0;
- while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(trans, &iter)) &&
- !(ret = PTR_ERR_OR_ZERO(b))) {
- if (kthread && kthread_should_stop())
- break;
-
- if ((cmp_int(btree, end.btree) ?:
- bpos_cmp(b->key.k.p, end.pos)) > 0)
- break;
-
- stats->pos = BBPOS(iter.btree_id, iter.pos);
-
- if (!pred(c, arg, b, &io_opts, &data_opts))
- goto next;
-
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret;
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-next:
- bch2_btree_iter_next_node(trans, &iter);
- }
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (kthread && kthread_should_stop())
- break;
- }
-
- bch_err_fn(c, ret);
- bch2_moving_ctxt_exit(&ctxt);
- bch2_btree_interior_updates_flush(c);
-
- return ret;
-}
-
-static bool rereplicate_pred(struct bch_fs *c, void *arg,
- enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- unsigned nr_good = bch2_bkey_durability(c, k);
- unsigned replicas = bkey_is_btree_ptr(k.k)
- ? c->opts.metadata_replicas
- : io_opts->data_replicas;
-
- guard(rcu)();
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i = 0;
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ptr->cached &&
- (!ca || !ca->mi.durability))
- data_opts->kill_ptrs |= BIT(i);
- i++;
- }
-
- if (!data_opts->kill_ptrs &&
- (!nr_good || nr_good >= replicas))
- return false;
-
- data_opts->target = 0;
- data_opts->extra_replicas = replicas - nr_good;
- data_opts->btree_insert_flags = 0;
- return true;
-}
-
-static bool migrate_pred(struct bch_fs *c, void *arg,
- enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_ioctl_data *op = arg;
- unsigned i = 0;
-
- data_opts->rewrite_ptrs = 0;
- data_opts->target = 0;
- data_opts->extra_replicas = 0;
- data_opts->btree_insert_flags = 0;
-
- bkey_for_each_ptr(ptrs, ptr) {
- if (ptr->dev == op->migrate.dev)
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
-
- return data_opts->rewrite_ptrs != 0;
-}
-
-static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts);
-}
-
-/*
- * Ancient versions of bcachefs produced packed formats which could represent
- * keys that the in memory format cannot represent; this checks for those
- * formats so we can get rid of them.
- */
-static bool bformat_needs_redo(struct bkey_format *f)
-{
- for (unsigned i = 0; i < f->nr_fields; i++)
- if (bch2_bkey_format_field_overflows(f, i))
- return true;
-
- return false;
-}
-
-static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- if (b->version_ondisk != c->sb.version ||
- btree_node_need_rewrite(b) ||
- bformat_needs_redo(&b->format)) {
- data_opts->target = 0;
- data_opts->extra_replicas = 0;
- data_opts->btree_insert_flags = 0;
- return true;
- }
-
- return false;
-}
-
-int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
-{
- int ret;
-
- ret = bch2_move_btree(c,
- BBPOS_MIN,
- BBPOS_MAX,
- rewrite_old_nodes_pred, c, stats);
- if (!ret) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
- c->disk_sb.sb->version_min = c->disk_sb.sb->version;
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- bch_err_fn(c, ret);
- return ret;
-}
-
-static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
- enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- unsigned durability = bch2_bkey_durability(c, k);
- unsigned replicas = bkey_is_btree_ptr(k.k)
- ? c->opts.metadata_replicas
- : io_opts->data_replicas;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned i = 0;
-
- guard(rcu)();
- bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
- unsigned d = bch2_extent_ptr_durability(c, &p);
-
- if (d && durability - d >= replicas) {
- data_opts->kill_ptrs |= BIT(i);
- durability -= d;
- }
-
- i++;
- }
-
- return data_opts->kill_ptrs != 0;
-}
-
-static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key),
- io_opts, data_opts);
-}
-
-static bool scrub_pred(struct bch_fs *c, void *_arg,
- enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bch_ioctl_data *arg = _arg;
-
- if (k.k->type != KEY_TYPE_btree_ptr_v2) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == arg->migrate.dev) {
- if (!p.crc.csum_type)
- return false;
- break;
- }
- }
-
- data_opts->scrub = true;
- data_opts->read_dev = arg->migrate.dev;
- return true;
-}
-
-int bch2_data_job(struct bch_fs *c,
- struct bch_move_stats *stats,
- struct bch_ioctl_data op)
-{
- struct bbpos start = BBPOS(op.start_btree, op.start_pos);
- struct bbpos end = BBPOS(op.end_btree, op.end_pos);
- int ret = 0;
-
- if (op.op >= BCH_DATA_OP_NR)
- return -EINVAL;
-
- bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
-
- switch (op.op) {
- case BCH_DATA_OP_scrub:
- /*
- * prevent tests from spuriously failing, make sure we see all
- * btree nodes that need to be repaired
- */
- bch2_btree_interior_updates_flush(c);
-
- ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
- op.scrub.data_types,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- false,
- scrub_pred, &op) ?: ret;
- break;
-
- case BCH_DATA_OP_rereplicate:
- stats->data_type = BCH_DATA_journal;
- ret = bch2_journal_flush_device_pins(&c->journal, -1);
- ret = bch2_move_btree(c, start, end,
- rereplicate_btree_pred, c, stats) ?: ret;
- ret = bch2_move_data(c, start, end,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- true,
- rereplicate_pred, c) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
- break;
- case BCH_DATA_OP_migrate:
- if (op.migrate.dev >= c->sb.nr_devices)
- return -EINVAL;
-
- stats->data_type = BCH_DATA_journal;
- ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
- ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
- ~0,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- true,
- migrate_pred, &op) ?: ret;
- bch2_btree_interior_updates_flush(c);
- ret = bch2_replicas_gc2(c) ?: ret;
- break;
- case BCH_DATA_OP_rewrite_old_nodes:
- ret = bch2_scan_old_btree_nodes(c, stats);
- break;
- case BCH_DATA_OP_drop_extra_replicas:
- ret = bch2_move_btree(c, start, end,
- drop_extra_replicas_btree_pred, c, stats) ?: ret;
- ret = bch2_move_data(c, start, end, NULL, stats,
- writepoint_hashed((unsigned long) current),
- true,
- drop_extra_replicas_pred, c) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
- break;
- default:
- ret = -EINVAL;
- }
-
- bch2_move_stats_exit(stats, c);
- return ret;
-}
-
-void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
-{
- prt_printf(out, "%s: data type==", stats->name);
- bch2_prt_data_type(out, stats->data_type);
- prt_str(out, " pos=");
- bch2_bbpos_to_text(out, stats->pos);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
- prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
- prt_printf(out, "bytes seen:\t");
- prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
- prt_newline(out);
-
- prt_printf(out, "bytes moved:\t");
- prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
- prt_newline(out);
-
- prt_printf(out, "bytes raced:\t");
- prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
-}
-
-static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- bch2_move_stats_to_text(out, ctxt->stats);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
- atomic_read(&ctxt->read_ios),
- c->opts.move_ios_in_flight,
- atomic_read(&ctxt->read_sectors),
- c->opts.move_bytes_in_flight >> 9);
-
- prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
- atomic_read(&ctxt->write_ios),
- c->opts.move_ios_in_flight,
- atomic_read(&ctxt->write_sectors),
- c->opts.move_bytes_in_flight >> 9);
-
- printbuf_indent_add(out, 2);
-
- mutex_lock(&ctxt->lock);
- struct moving_io *io;
- list_for_each_entry(io, &ctxt->ios, io_list)
- bch2_data_update_inflight_to_text(out, &io->write);
- mutex_unlock(&ctxt->lock);
-
- printbuf_indent_sub(out, 4);
-}
-
-void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct moving_context *ctxt;
-
- mutex_lock(&c->moving_context_lock);
- list_for_each_entry(ctxt, &c->moving_context_list, list)
- bch2_moving_ctxt_to_text(out, c, ctxt);
- mutex_unlock(&c->moving_context_lock);
-}
-
-void bch2_fs_move_init(struct bch_fs *c)
-{
- INIT_LIST_HEAD(&c->moving_context_list);
- mutex_init(&c->moving_context_lock);
-}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
deleted file mode 100644
index 86b80499ac55..000000000000
--- a/fs/bcachefs/move.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVE_H
-#define _BCACHEFS_MOVE_H
-
-#include "bbpos.h"
-#include "bcachefs_ioctl.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "data_update.h"
-#include "move_types.h"
-
-struct bch_read_bio;
-
-struct moving_context {
- struct btree_trans *trans;
- struct list_head list;
- void *fn;
-
- struct bch_ratelimit *rate;
- struct bch_move_stats *stats;
- struct write_point_specifier wp;
- bool wait_on_copygc;
- bool write_error;
-
- /* For waiting on outstanding reads and writes: */
- struct closure cl;
-
- struct mutex lock;
- struct list_head reads;
- struct list_head ios;
-
- /* in flight sectors: */
- atomic_t read_sectors;
- atomic_t write_sectors;
- atomic_t read_ios;
- atomic_t write_ios;
-
- wait_queue_head_t wait;
-};
-
-#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \
-({ \
- int _ret = 0; \
- while (true) { \
- bool cond_finished = false; \
- bch2_moving_ctxt_do_pending_writes(_ctxt); \
- \
- if (_cond) \
- break; \
- bch2_trans_unlock_long((_ctxt)->trans); \
- _ret = __wait_event_timeout((_ctxt)->wait, \
- bch2_moving_ctxt_next_pending_write(_ctxt) || \
- (cond_finished = (_cond)), _timeout); \
- if (_ret || ( cond_finished)) \
- break; \
- } \
- _ret; \
-})
-
-#define move_ctxt_wait_event(_ctxt, _cond) \
-do { \
- bool cond_finished = false; \
- bch2_moving_ctxt_do_pending_writes(_ctxt); \
- \
- if (_cond) \
- break; \
- bch2_trans_unlock_long((_ctxt)->trans); \
- __wait_event((_ctxt)->wait, \
- bch2_moving_ctxt_next_pending_write(_ctxt) || \
- (cond_finished = (_cond))); \
- if (cond_finished) \
- break; \
-} while (1)
-
-typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c,
- struct bch_io_opts *, struct data_update_opts *);
-
-extern const char * const bch2_data_ops_strs[];
-
-void bch2_moving_ctxt_exit(struct moving_context *);
-void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
- struct bch_ratelimit *, struct bch_move_stats *,
- struct write_point_specifier, bool);
-struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
-void bch2_moving_ctxt_flush_all(struct moving_context *);
-void bch2_move_ctxt_wait_for_io(struct moving_context *);
-int bch2_move_ratelimit(struct moving_context *);
-
-/* Inodes in different snapshots may have different IO options: */
-struct snapshot_io_opts_entry {
- u32 snapshot;
- struct bch_io_opts io_opts;
-};
-
-struct per_snapshot_io_opts {
- u64 cur_inum;
- struct bch_io_opts fs_io_opts;
- DARRAY(struct snapshot_io_opts_entry) d;
-};
-
-static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
-{
- memset(io_opts, 0, sizeof(*io_opts));
- io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
-}
-
-static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
-{
- darray_exit(&io_opts->d);
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
- struct btree_iter *, struct bkey_s_c);
-
-int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
-
-int bch2_move_extent(struct moving_context *,
- struct move_bucket *,
- struct btree_iter *,
- struct bkey_s_c,
- struct bch_io_opts,
- struct data_update_opts);
-
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
- struct per_snapshot_io_opts *, struct bpos,
- struct btree_iter *, struct bkey_s_c);
-
-int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
- move_pred_fn, void *, enum btree_id, unsigned);
-int __bch2_move_data(struct moving_context *,
- struct bbpos,
- struct bbpos,
- move_pred_fn, void *);
-int bch2_move_data(struct bch_fs *,
- struct bbpos start,
- struct bbpos end,
- struct bch_ratelimit *,
- struct bch_move_stats *,
- struct write_point_specifier,
- bool,
- move_pred_fn, void *);
-
-int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned,
- struct bch_ratelimit *, struct bch_move_stats *,
- struct write_point_specifier, bool,
- move_pred_fn, void *);
-
-int bch2_evacuate_bucket(struct moving_context *,
- struct move_bucket *,
- struct bpos, int,
- struct data_update_opts);
-int bch2_data_job(struct bch_fs *,
- struct bch_move_stats *,
- struct bch_ioctl_data);
-
-void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
-void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
-void bch2_move_stats_init(struct bch_move_stats *, const char *);
-
-void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_move_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_MOVE_H */
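As a rough illustration of the move API declared above (a hypothetical sketch, not code from these files): a caller supplies a move_pred_fn that inspects each key and fills in data_update_opts, then hands it to bch2_move_data(). The example_* names below are invented; the helpers and signatures are the ones visible in move.h and move.c above.

/*
 * Hypothetical example: rewrite every non-cached pointer in the filesystem,
 * in the same way migrate_pred()/evacuate_bucket_pred() in move.c drive the
 * move path.
 */
static bool example_rewrite_all_pred(struct bch_fs *c, void *arg,
				     enum btree_id btree, struct bkey_s_c k,
				     struct bch_io_opts *io_opts,
				     struct data_update_opts *data_opts)
{
	unsigned i = 0;

	data_opts->rewrite_ptrs = 0;

	/* Ask for every non-cached pointer to be rewritten: */
	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
		if (!ptr->cached)
			data_opts->rewrite_ptrs |= BIT(i);
		i++;
	}

	return data_opts->rewrite_ptrs != 0;
}

static int example_rewrite_everything(struct bch_fs *c, struct bch_move_stats *stats)
{
	/* stats is assumed to have been set up with bch2_move_stats_init() */
	return bch2_move_data(c, BBPOS_MIN, BBPOS_MAX,
			      NULL,			/* no rate limit */
			      stats,
			      writepoint_hashed((unsigned long) current),
			      true,			/* wait_on_copygc */
			      example_rewrite_all_pred, NULL);
}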
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
deleted file mode 100644
index c5c62cd600de..000000000000
--- a/fs/bcachefs/move_types.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVE_TYPES_H
-#define _BCACHEFS_MOVE_TYPES_H
-
-#include "bbpos_types.h"
-#include "bcachefs_ioctl.h"
-
-struct bch_move_stats {
- char name[32];
- bool phys;
- enum bch_ioctl_data_event_ret ret;
-
- union {
- struct {
- enum bch_data_type data_type;
- struct bbpos pos;
- };
- struct {
- unsigned dev;
- u64 offset;
- };
- };
-
- atomic64_t keys_moved;
- atomic64_t keys_raced;
- atomic64_t sectors_seen;
- atomic64_t sectors_moved;
- atomic64_t sectors_raced;
- atomic64_t sectors_error_corrected;
- atomic64_t sectors_error_uncorrected;
-};
-
-struct move_bucket_key {
- struct bpos bucket;
- unsigned gen;
-};
-
-struct move_bucket {
- struct move_bucket *next;
- struct rhash_head hash;
- struct move_bucket_key k;
- unsigned sectors;
- atomic_t count;
-};
-
-#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
deleted file mode 100644
index 5e6de91a8763..000000000000
--- a/fs/bcachefs/movinggc.c
+++ /dev/null
@@ -1,476 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Moving/copying garbage collector
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "backpointers.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "clock.h"
-#include "errcode.h"
-#include "error.h"
-#include "lru.h"
-#include "move.h"
-#include "movinggc.h"
-#include "trace.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/sched/task.h>
-#include <linux/wait.h>
-
-struct buckets_in_flight {
- struct rhashtable *table;
- struct move_bucket *first;
- struct move_bucket *last;
- size_t nr;
- size_t sectors;
-
- DARRAY(struct move_bucket *) to_evacuate;
-};
-
-static const struct rhashtable_params bch_move_bucket_params = {
- .head_offset = offsetof(struct move_bucket, hash),
- .key_offset = offsetof(struct move_bucket, k),
- .key_len = sizeof(struct move_bucket_key),
- .automatic_shrinking = true,
-};
-
-static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b)
-{
- if (!list->first)
- list->first = b;
- else
- list->last->next = b;
-
- list->last = b;
- list->nr++;
- list->sectors += b->sectors;
-}
-
-static int bch2_bucket_is_movable(struct btree_trans *trans,
- struct move_bucket *b, u64 time)
-{
- struct bch_fs *c = trans->c;
-
- if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
- return 0;
-
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- b->k.bucket, BTREE_ITER_cached);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget(c, k.k->p);
- if (!ca)
- goto out;
-
- if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset))
- goto out;
-
- if (ca->mi.state != BCH_MEMBER_STATE_rw ||
- !bch2_dev_is_online(ca))
- goto out;
-
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- b->k.gen = a->gen;
- b->sectors = bch2_bucket_sectors_dirty(*a);
- u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
-
- ret = lru_idx && lru_idx <= time;
-out:
- bch2_dev_put(ca);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static void move_bucket_free(struct buckets_in_flight *list,
- struct move_bucket *b)
-{
- int ret = rhashtable_remove_fast(list->table, &b->hash,
- bch_move_bucket_params);
- BUG_ON(ret);
- kfree(b);
-}
-
-static void move_buckets_wait(struct moving_context *ctxt,
- struct buckets_in_flight *list,
- bool flush)
-{
- struct move_bucket *i;
-
- while ((i = list->first)) {
- if (flush)
- move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
-
- if (atomic_read(&i->count))
- break;
-
- list->first = i->next;
- if (!list->first)
- list->last = NULL;
-
- list->nr--;
- list->sectors -= i->sectors;
-
- move_bucket_free(list, i);
- }
-
- bch2_trans_unlock_long(ctxt->trans);
-}
-
-static bool bucket_in_flight(struct buckets_in_flight *list,
- struct move_bucket_key k)
-{
- return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params);
-}
-
-static int bch2_copygc_get_buckets(struct moving_context *ctxt,
- struct buckets_in_flight *buckets_in_flight)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
- size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
- int ret;
-
- move_buckets_wait(ctxt, buckets_in_flight, false);
-
- ret = bch2_btree_write_buffer_tryflush(trans);
- if (bch2_err_matches(ret, EROFS))
- return ret;
-
- if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
- return ret;
-
- ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
- lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
- 0, k, ({
- struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
- int ret2 = 0;
-
- saw++;
-
- ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
- if (ret2 < 0)
- goto err;
-
- if (!ret2)
- not_movable++;
- else if (bucket_in_flight(buckets_in_flight, b.k))
- in_flight++;
- else {
- struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
- ret2 = b_i ? 0 : -ENOMEM;
- if (ret2)
- goto err;
-
- *b_i = b;
-
- ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
- if (ret2) {
- kfree(b_i);
- goto err;
- }
-
- ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
- bch_move_bucket_params);
- BUG_ON(ret2);
-
- sectors += b.sectors;
- }
-
- ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
-err:
- ret2;
- }));
-
- pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
- buckets_in_flight->nr, buckets_in_flight->sectors,
- saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);
-
- return ret < 0 ? ret : 0;
-}
-
-noinline
-static int bch2_copygc(struct moving_context *ctxt,
- struct buckets_in_flight *buckets_in_flight,
- bool *did_work)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct data_update_opts data_opts = {
- .btree_insert_flags = BCH_WATERMARK_copygc,
- };
- u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen);
- u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
- int ret = 0;
-
- ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
- if (ret)
- goto err;
-
- darray_for_each(buckets_in_flight->to_evacuate, i) {
- if (kthread_should_stop() || freezing(current))
- break;
-
- struct move_bucket *b = *i;
- *i = NULL;
-
- move_bucket_in_flight_add(buckets_in_flight, b);
-
- ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts);
- if (ret)
- goto err;
-
- *did_work = true;
- }
-err:
- /* no entries in LRU btree found, or got to end: */
- if (bch2_err_matches(ret, ENOENT))
- ret = 0;
-
- if (ret < 0 && !bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "from bch2_move_data()");
-
- sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
- sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
- trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved);
-
- darray_for_each(buckets_in_flight->to_evacuate, i)
- if (*i)
- move_bucket_free(buckets_in_flight, *i);
- darray_exit(&buckets_in_flight->to_evacuate);
- return ret;
-}
-
-static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
-{
- struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca);
- struct bch_dev_usage usage;
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- usage.buckets[i] = usage_full.d[i].buckets;
-
- s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
- ca->mi.bucket_size) >> 1);
- s64 fragmented = 0;
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- if (data_type_movable(i))
- fragmented += usage_full.d[i].fragmented;
-
- return max(0LL, fragmented_allowed - fragmented);
-}
-
-/*
- * Copygc runs when the amount of fragmented data is above some arbitrary
- * threshold:
- *
- * The threshold at the limit - when the device is full - is the amount of space
- * we reserved in bch2_recalc_capacity; we can't have more than that amount of
- * disk space stranded due to fragmentation and still store everything we have
- * promised to store.
- *
- * But we don't want to be running copygc unnecessarily when the device still
- * has plenty of free space - rather, we want copygc to smoothly run every so
- * often and continually reduce the amount of fragmented space as the device
- * fills up. So, we increase the threshold by half the current free space.
- */
-u64 bch2_copygc_wait_amount(struct bch_fs *c)
-{
- u64 wait = U64_MAX;
-
- guard(rcu)();
- for_each_rw_member_rcu(c, ca)
- wait = min(wait, bch2_copygc_dev_wait_amount(ca));
- return wait;
-}
-
-void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 32);
- prt_printf(out, "running:\t%u\n", c->copygc_running);
- prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait);
- prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at);
-
- prt_printf(out, "Currently waiting for:\t");
- prt_human_readable_u64(out, max(0LL, c->copygc_wait -
- atomic64_read(&c->io_clock[WRITE].now)) << 9);
- prt_newline(out);
-
- prt_printf(out, "Currently waiting since:\t");
- prt_human_readable_u64(out, max(0LL,
- atomic64_read(&c->io_clock[WRITE].now) -
- c->copygc_wait_at) << 9);
- prt_newline(out);
-
- bch2_printbuf_make_room(out, 4096);
-
- struct task_struct *t;
- out->atomic++;
- scoped_guard(rcu) {
- prt_printf(out, "Currently calculated wait:\n");
- for_each_rw_member_rcu(c, ca) {
- prt_printf(out, " %s:\t", ca->name);
- prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
- prt_newline(out);
- }
-
- t = rcu_dereference(c->copygc_thread);
- if (t)
- get_task_struct(t);
- }
- --out->atomic;
-
- if (t) {
- bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
- put_task_struct(t);
- }
-}
-
-static int bch2_copygc_thread(void *arg)
-{
- struct bch_fs *c = arg;
- struct moving_context ctxt;
- struct bch_move_stats move_stats;
- struct io_clock *clock = &c->io_clock[WRITE];
- struct buckets_in_flight buckets = {};
- u64 last, wait;
-
- buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL);
- int ret = !buckets.table
- ? -ENOMEM
- : rhashtable_init(buckets.table, &bch_move_bucket_params);
- bch_err_msg(c, ret, "allocating copygc buckets in flight");
- if (ret)
- goto err;
-
- set_freezable();
-
- /*
- * Data move operations can't run until after check_snapshots has
- * completed, and bch2_snapshot_is_ancestor() is available.
- */
- kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
- kthread_should_stop());
-
- bch2_move_stats_init(&move_stats, "copygc");
- bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
- writepoint_ptr(&c->copygc_write_point),
- false);
-
- while (!ret && !kthread_should_stop()) {
- bool did_work = false;
-
- bch2_trans_unlock_long(ctxt.trans);
- cond_resched();
-
- if (!c->opts.copygc_enabled) {
- move_buckets_wait(&ctxt, &buckets, true);
- kthread_wait_freezable(c->opts.copygc_enabled ||
- kthread_should_stop());
- }
-
- if (unlikely(freezing(current))) {
- move_buckets_wait(&ctxt, &buckets, true);
- __refrigerator(false);
- continue;
- }
-
- last = atomic64_read(&clock->now);
- wait = bch2_copygc_wait_amount(c);
-
- if (wait > clock->max_slop) {
- c->copygc_wait_at = last;
- c->copygc_wait = last + wait;
- move_buckets_wait(&ctxt, &buckets, true);
- trace_and_count(c, copygc_wait, c, wait, last + wait);
- bch2_kthread_io_clock_wait(clock, last + wait,
- MAX_SCHEDULE_TIMEOUT);
- continue;
- }
-
- c->copygc_wait = 0;
-
- c->copygc_running = true;
- ret = bch2_copygc(&ctxt, &buckets, &did_work);
- c->copygc_running = false;
-
- wake_up(&c->copygc_running_wq);
-
- if (!wait && !did_work) {
- u64 min_member_capacity = bch2_min_rw_member_capacity(c);
-
- if (min_member_capacity == U64_MAX)
- min_member_capacity = 128 * 2048;
-
- move_buckets_wait(&ctxt, &buckets, true);
- bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
- MAX_SCHEDULE_TIMEOUT);
- }
- }
-
- move_buckets_wait(&ctxt, &buckets, true);
- rhashtable_destroy(buckets.table);
- bch2_moving_ctxt_exit(&ctxt);
- bch2_move_stats_exit(&move_stats, c);
-err:
- kfree(buckets.table);
- return ret;
-}
-
-void bch2_copygc_stop(struct bch_fs *c)
-{
- if (c->copygc_thread) {
- kthread_stop(c->copygc_thread);
- put_task_struct(c->copygc_thread);
- }
- c->copygc_thread = NULL;
-}
-
-int bch2_copygc_start(struct bch_fs *c)
-{
- struct task_struct *t;
- int ret;
-
- if (c->copygc_thread)
- return 0;
-
- if (c->opts.nochanges)
- return 0;
-
- if (bch2_fs_init_fault("copygc_start"))
- return -ENOMEM;
-
- t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
- ret = PTR_ERR_OR_ZERO(t);
- bch_err_msg(c, ret, "creating copygc thread");
- if (ret)
- return ret;
-
- get_task_struct(t);
-
- c->copygc_thread = t;
- wake_up_process(c->copygc_thread);
-
- return 0;
-}
-
-void bch2_fs_copygc_init(struct bch_fs *c)
-{
- init_waitqueue_head(&c->copygc_running_wq);
- c->copygc_running = false;
-}
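To make the copygc threshold above concrete (illustrative numbers only, assuming a single rw device with 512-sector buckets): if 1,000 buckets are available at BCH_WATERMARK_stripe, bch2_copygc_dev_wait_amount() computes fragmented_allowed = (1000 * 512) >> 1 = 256,000 sectors; with 100,000 sectors currently fragmented it returns a wait of 156,000 sectors, so the copygc thread sleeps on the write I/O clock until roughly that much more foreground write traffic (about 76 MiB) has gone by before it starts evacuating buckets.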
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
deleted file mode 100644
index f615910d6f98..000000000000
--- a/fs/bcachefs/movinggc.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_MOVINGGC_H
-#define _BCACHEFS_MOVINGGC_H
-
-u64 bch2_copygc_wait_amount(struct bch_fs *);
-void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
-
-static inline void bch2_copygc_wakeup(struct bch_fs *c)
-{
- guard(rcu)();
- struct task_struct *p = rcu_dereference(c->copygc_thread);
- if (p)
- wake_up_process(p);
-}
-
-void bch2_copygc_stop(struct bch_fs *);
-int bch2_copygc_start(struct bch_fs *);
-void bch2_fs_copygc_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
deleted file mode 100644
index c3f87c59922d..000000000000
--- a/fs/bcachefs/namei.c
+++ /dev/null
@@ -1,1034 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "btree_update.h"
-#include "dirent.h"
-#include "inode.h"
-#include "namei.h"
-#include "subvolume.h"
-#include "xattr.h"
-
-#include <linux/posix_acl.h>
-
-static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode)
-{
- return (subvol_inum) {
- .subvol = inode->bi_parent_subvol ?: inum.subvol,
- .inum = inode->bi_dir,
- };
-}
-
-static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
-{
- return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
-}
-
-int bch2_create_trans(struct btree_trans *trans,
- subvol_inum dir,
- struct bch_inode_unpacked *dir_u,
- struct bch_inode_unpacked *new_inode,
- const struct qstr *name,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct posix_acl *default_acl,
- struct posix_acl *acl,
- subvol_inum snapshot_src,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = {};
- struct btree_iter inode_iter = {};
- subvol_inum new_inum = dir;
- u64 now = bch2_current_time(c);
- u64 cpu = raw_smp_processor_id();
- u64 dir_target;
- u32 snapshot;
- unsigned dir_type = mode_to_type(mode);
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
- BTREE_ITER_intent|BTREE_ITER_with_updates);
- if (ret)
- goto err;
-
- if (!(flags & BCH_CREATE_SNAPSHOT)) {
- /* Normal create path - allocate a new inode: */
- bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u);
-
- if (flags & BCH_CREATE_TMPFILE)
- new_inode->bi_flags |= BCH_INODE_unlinked;
-
- ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
- if (ret)
- goto err;
-
- snapshot_src = (subvol_inum) { 0 };
- } else {
- /*
- * Creating a snapshot - we're not allocating a new inode, but
- * we do have to look up the root inode of the subvolume we're
- * snapshotting and update it (in the new snapshot):
- */
-
- if (!snapshot_src.inum) {
- /* Inode wasn't specified, just snapshot: */
- struct bch_subvolume s;
- ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s);
- if (ret)
- goto err;
-
- snapshot_src.inum = le64_to_cpu(s.inode);
- }
-
- ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (new_inode->bi_subvol != snapshot_src.subvol) {
- /* Not a subvolume root: */
- ret = -EINVAL;
- goto err;
- }
-
- /*
- * If we're not root, we have to own the subvolume being
- * snapshotted:
- */
- if (uid && new_inode->bi_uid != uid) {
- ret = -EPERM;
- goto err;
- }
-
- flags |= BCH_CREATE_SUBVOL;
- }
-
- new_inum.inum = new_inode->bi_inum;
- dir_target = new_inode->bi_inum;
-
- if (flags & BCH_CREATE_SUBVOL) {
- u32 new_subvol, dir_snapshot;
-
- ret = bch2_subvolume_create(trans, new_inode->bi_inum,
- dir.subvol,
- snapshot_src.subvol,
- &new_subvol, &snapshot,
- (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
- if (ret)
- goto err;
-
- new_inode->bi_parent_subvol = dir.subvol;
- new_inode->bi_subvol = new_subvol;
- new_inum.subvol = new_subvol;
- dir_target = new_subvol;
- dir_type = DT_SUBVOL;
-
- ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
- if (ret)
- goto err;
-
- bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot);
- ret = bch2_btree_iter_traverse(trans, &dir_iter);
- if (ret)
- goto err;
- }
-
- if (!(flags & BCH_CREATE_SNAPSHOT)) {
- if (default_acl) {
- ret = bch2_set_acl_trans(trans, new_inum, new_inode,
- default_acl, ACL_TYPE_DEFAULT);
- if (ret)
- goto err;
- }
-
- if (acl) {
- ret = bch2_set_acl_trans(trans, new_inum, new_inode,
- acl, ACL_TYPE_ACCESS);
- if (ret)
- goto err;
- }
- }
-
- if (!(flags & BCH_CREATE_TMPFILE)) {
- struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
- u64 dir_offset;
-
- if (is_subdir_for_nlink(new_inode))
- dir_u->bi_nlink++;
- dir_u->bi_mtime = dir_u->bi_ctime = now;
-
- ret = bch2_dirent_create(trans, dir, &dir_hash,
- dir_type,
- name,
- dir_target,
- &dir_offset,
- STR_HASH_must_create|BTREE_ITER_with_updates) ?:
- bch2_inode_write(trans, &dir_iter, dir_u);
- if (ret)
- goto err;
-
- new_inode->bi_dir = dir_u->bi_inum;
- new_inode->bi_dir_offset = dir_offset;
- }
-
- if (S_ISDIR(mode)) {
- ret = bch2_maybe_propagate_has_case_insensitive(trans,
- (subvol_inum) {
- new_inode->bi_subvol ?: dir.subvol,
- new_inode->bi_inum },
- new_inode);
- if (ret)
- goto err;
- }
-
- if (S_ISDIR(mode) &&
- !new_inode->bi_subvol)
- new_inode->bi_depth = dir_u->bi_depth + 1;
-
- inode_iter.flags &= ~BTREE_ITER_all_snapshots;
- bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot);
-
- ret = bch2_btree_iter_traverse(trans, &inode_iter) ?:
- bch2_inode_write(trans, &inode_iter, new_inode);
-err:
- bch2_trans_iter_exit(trans, &inode_iter);
- bch2_trans_iter_exit(trans, &dir_iter);
- return ret;
-}
-
-int bch2_link_trans(struct btree_trans *trans,
- subvol_inum dir, struct bch_inode_unpacked *dir_u,
- subvol_inum inum, struct bch_inode_unpacked *inode_u,
- const struct qstr *name)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = {};
- struct btree_iter inode_iter = {};
- struct bch_hash_info dir_hash;
- u64 now = bch2_current_time(c);
- u64 dir_offset = 0;
- int ret;
-
- if (dir.subvol != inum.subvol)
- return -EXDEV;
-
- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
- if (ret)
- return ret;
-
- inode_u->bi_ctime = now;
- ret = bch2_inode_nlink_inc(inode_u);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (bch2_reinherit_attrs(inode_u, dir_u)) {
- ret = -EXDEV;
- goto err;
- }
-
- dir_u->bi_mtime = dir_u->bi_ctime = now;
-
- dir_hash = bch2_hash_info_init(c, dir_u);
-
- ret = bch2_dirent_create(trans, dir, &dir_hash,
- mode_to_type(inode_u->bi_mode),
- name, inum.inum,
- &dir_offset,
- STR_HASH_must_create);
- if (ret)
- goto err;
-
- inode_u->bi_dir = dir.inum;
- inode_u->bi_dir_offset = dir_offset;
-
- ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
- bch2_inode_write(trans, &inode_iter, inode_u);
-err:
- bch2_trans_iter_exit(trans, &dir_iter);
- bch2_trans_iter_exit(trans, &inode_iter);
- return ret;
-}
-
-int bch2_unlink_trans(struct btree_trans *trans,
- subvol_inum dir,
- struct bch_inode_unpacked *dir_u,
- struct bch_inode_unpacked *inode_u,
- const struct qstr *name,
- bool deleting_subvol)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = {};
- struct btree_iter dirent_iter = {};
- struct btree_iter inode_iter = {};
- struct bch_hash_info dir_hash;
- subvol_inum inum;
- u64 now = bch2_current_time(c);
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
- if (ret)
- goto err;
-
- dir_hash = bch2_hash_info_init(c, dir_u);
-
- ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
- name, &inum, BTREE_ITER_intent);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
- ret = bch2_empty_dir_trans(trans, inum);
- if (ret)
- goto err;
- }
-
- if (deleting_subvol && !inode_u->bi_subvol) {
- ret = bch_err_throw(c, ENOENT_not_subvol);
- goto err;
- }
-
- if (inode_u->bi_subvol) {
- /* Recursive subvolume destroy not allowed (yet?) */
- ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
- if (ret)
- goto err;
- }
-
- if (deleting_subvol || inode_u->bi_subvol) {
- ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
- if (ret)
- goto err;
-
- k = bch2_btree_iter_peek_slot(trans, &dirent_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- /*
- * If we're deleting a subvolume, we need to really delete the
- * dirent, not just emit a whiteout in the current snapshot:
- */
- bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot);
- ret = bch2_btree_iter_traverse(trans, &dirent_iter);
- if (ret)
- goto err;
- } else {
- bch2_inode_nlink_dec(trans, inode_u);
- }
-
- if (inode_u->bi_dir == dirent_iter.pos.inode &&
- inode_u->bi_dir_offset == dirent_iter.pos.offset) {
- inode_u->bi_dir = 0;
- inode_u->bi_dir_offset = 0;
- }
-
- dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
- dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
-
- ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash, &dirent_iter,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_inode_write(trans, &dir_iter, dir_u) ?:
- bch2_inode_write(trans, &inode_iter, inode_u);
-err:
- bch2_trans_iter_exit(trans, &inode_iter);
- bch2_trans_iter_exit(trans, &dirent_iter);
- bch2_trans_iter_exit(trans, &dir_iter);
- return ret;
-}
-
-bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
- struct bch_inode_unpacked *src_u)
-{
- u64 src, dst;
- unsigned id;
- bool ret = false;
-
- for (id = 0; id < Inode_opt_nr; id++) {
- if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold)
- continue;
-
- /* Skip attributes that were explicitly set on this inode */
- if (dst_u->bi_fields_set & (1 << id))
- continue;
-
- src = bch2_inode_opt_get(src_u, id);
- dst = bch2_inode_opt_get(dst_u, id);
-
- if (src == dst)
- continue;
-
- bch2_inode_opt_set(dst_u, id, src);
- ret = true;
- }
-
- return ret;
-}
-
-static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
-{
- struct btree_iter iter;
- struct bkey_i_subvolume *s =
- bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvol),
- BTREE_ITER_cached, subvolume);
- int ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- s->v.fs_path_parent = cpu_to_le32(new_parent);
- bch2_trans_iter_exit(trans, &iter);
- return 0;
-}
-
-int bch2_rename_trans(struct btree_trans *trans,
- subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
- subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
- struct bch_inode_unpacked *src_inode_u,
- struct bch_inode_unpacked *dst_inode_u,
- const struct qstr *src_name,
- const struct qstr *dst_name,
- enum bch_rename_mode mode)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter src_dir_iter = {};
- struct btree_iter dst_dir_iter = {};
- struct btree_iter src_inode_iter = {};
- struct btree_iter dst_inode_iter = {};
- struct bch_hash_info src_hash, dst_hash;
- subvol_inum src_inum, dst_inum;
- u64 src_offset, dst_offset;
- u64 now = bch2_current_time(c);
- int ret;
-
- ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- src_hash = bch2_hash_info_init(c, src_dir_u);
-
- if (!subvol_inum_eq(dst_dir, src_dir)) {
- ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- dst_hash = bch2_hash_info_init(c, dst_dir_u);
- } else {
- dst_dir_u = src_dir_u;
- dst_hash = src_hash;
- }
-
- ret = bch2_dirent_rename(trans,
- src_dir, &src_hash,
- dst_dir, &dst_hash,
- src_name, &src_inum, &src_offset,
- dst_name, &dst_inum, &dst_offset,
- mode);
- if (ret)
- goto err;
-
- ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
- BTREE_ITER_intent);
- if (ret)
- goto err;
-
- if (dst_inum.inum) {
- ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
- BTREE_ITER_intent);
- if (ret)
- goto err;
- }
-
- if (src_inode_u->bi_subvol &&
- dst_dir.subvol != src_inode_u->bi_parent_subvol) {
- ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
- if (ret)
- goto err;
- }
-
- if (mode == BCH_RENAME_EXCHANGE &&
- dst_inode_u->bi_subvol &&
- src_dir.subvol != dst_inode_u->bi_parent_subvol) {
- ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
- if (ret)
- goto err;
- }
-
- /* Can't move across subvolumes, unless it's a subvolume root: */
- if (src_dir.subvol != dst_dir.subvol &&
- (!src_inode_u->bi_subvol ||
- (dst_inum.inum && !dst_inode_u->bi_subvol))) {
- ret = -EXDEV;
- goto err;
- }
-
- if (src_inode_u->bi_parent_subvol)
- src_inode_u->bi_parent_subvol = dst_dir.subvol;
-
- if ((mode == BCH_RENAME_EXCHANGE) &&
- dst_inode_u->bi_parent_subvol)
- dst_inode_u->bi_parent_subvol = src_dir.subvol;
-
- src_inode_u->bi_dir = dst_dir_u->bi_inum;
- src_inode_u->bi_dir_offset = dst_offset;
-
- if (mode == BCH_RENAME_EXCHANGE) {
- dst_inode_u->bi_dir = src_dir_u->bi_inum;
- dst_inode_u->bi_dir_offset = src_offset;
- }
-
- if (mode == BCH_RENAME_OVERWRITE &&
- dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
- dst_inode_u->bi_dir_offset == src_offset) {
- dst_inode_u->bi_dir = 0;
- dst_inode_u->bi_dir_offset = 0;
- }
-
- if (mode == BCH_RENAME_OVERWRITE) {
- if (S_ISDIR(src_inode_u->bi_mode) !=
- S_ISDIR(dst_inode_u->bi_mode)) {
- ret = -ENOTDIR;
- goto err;
- }
-
- if (S_ISDIR(dst_inode_u->bi_mode)) {
- ret = bch2_empty_dir_trans(trans, dst_inum);
- if (ret)
- goto err;
- }
- }
-
- if (!subvol_inum_eq(dst_dir, src_dir)) {
- if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
- S_ISDIR(src_inode_u->bi_mode)) {
- ret = -EXDEV;
- goto err;
- }
-
- if (mode == BCH_RENAME_EXCHANGE &&
- bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
- S_ISDIR(dst_inode_u->bi_mode)) {
- ret = -EXDEV;
- goto err;
- }
-
- ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?:
- (mode == BCH_RENAME_EXCHANGE
- ? bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u)
- : 0);
- if (ret)
- goto err;
-
- if (is_subdir_for_nlink(src_inode_u)) {
- src_dir_u->bi_nlink--;
- dst_dir_u->bi_nlink++;
- }
-
- if (S_ISDIR(src_inode_u->bi_mode) &&
- !src_inode_u->bi_subvol)
- src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
-
- if (mode == BCH_RENAME_EXCHANGE &&
- S_ISDIR(dst_inode_u->bi_mode) &&
- !dst_inode_u->bi_subvol)
- dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
- }
-
- if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
- dst_dir_u->bi_nlink--;
- src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
- }
-
- if (mode == BCH_RENAME_OVERWRITE)
- bch2_inode_nlink_dec(trans, dst_inode_u);
-
- src_dir_u->bi_mtime = now;
- src_dir_u->bi_ctime = now;
-
- if (src_dir.inum != dst_dir.inum) {
- dst_dir_u->bi_mtime = now;
- dst_dir_u->bi_ctime = now;
- }
-
- src_inode_u->bi_ctime = now;
-
- if (dst_inum.inum)
- dst_inode_u->bi_ctime = now;
-
- ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
- (src_dir.inum != dst_dir.inum
- ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
- : 0) ?:
- bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
- (dst_inum.inum
- ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
- : 0);
-err:
- bch2_trans_iter_exit(trans, &dst_inode_iter);
- bch2_trans_iter_exit(trans, &src_inode_iter);
- bch2_trans_iter_exit(trans, &dst_dir_iter);
- bch2_trans_iter_exit(trans, &src_dir_iter);
- return ret;
-}
-
-/* inum_to_path */
-
-static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
-{
- bch2_printbuf_make_room(out, n);
-
- unsigned can_print = min(n, printbuf_remaining(out));
-
- b += n;
-
- for (unsigned i = 0; i < can_print; i++)
- out->buf[out->pos++] = *((char *) --b);
-
- printbuf_nul_terminate(out);
-}
-
-static inline void prt_str_reversed(struct printbuf *out, const char *s)
-{
- prt_bytes_reversed(out, s, strlen(s));
-}
-
-static inline void reverse_bytes(void *b, size_t n)
-{
- char *e = b + n, *s = b;
-
- while (s < e) {
- --e;
- swap(*s, *e);
- s++;
- }
-}
-
-static int __bch2_inum_to_path(struct btree_trans *trans,
- u32 subvol, u64 inum, u32 snapshot,
- struct printbuf *path)
-{
- unsigned orig_pos = path->pos;
- int ret = 0;
- DARRAY(subvol_inum) inums = {};
-
- if (!snapshot) {
- ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot);
- if (ret)
- goto disconnected;
- }
-
- while (true) {
- subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum };
-
- if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) {
- prt_str_reversed(path, "(loop)");
- break;
- }
-
- ret = darray_push(&inums, n);
- if (ret)
- goto err;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0);
- if (ret)
- goto disconnected;
-
- if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL &&
- inode.bi_inum == BCACHEFS_ROOT_INO)
- break;
-
- if (!inode.bi_dir && !inode.bi_dir_offset) {
- ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer);
- goto disconnected;
- }
-
- inum = inode.bi_dir;
- if (inode.bi_parent_subvol) {
- subvol = inode.bi_parent_subvol;
- ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot);
- if (ret)
- goto disconnected;
- }
-
- struct btree_iter d_iter;
- struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
- BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot),
- 0, dirent);
- ret = bkey_err(d.s_c);
- if (ret)
- goto disconnected;
-
- struct qstr dirent_name = bch2_dirent_get_name(d);
-
- prt_bytes_reversed(path, dirent_name.name, dirent_name.len);
-
- prt_char(path, '/');
-
- bch2_trans_iter_exit(trans, &d_iter);
- }
-
- if (orig_pos == path->pos)
- prt_char(path, '/');
-out:
- ret = path->allocation_failure ? -ENOMEM : 0;
- if (ret)
- goto err;
-
- reverse_bytes(path->buf + orig_pos, path->pos - orig_pos);
- darray_exit(&inums);
- return 0;
-err:
- darray_exit(&inums);
- return ret;
-disconnected:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto err;
-
- prt_str_reversed(path, "(disconnected)");
- goto out;
-}
-
-int bch2_inum_to_path(struct btree_trans *trans,
- subvol_inum inum,
- struct printbuf *path)
-{
- return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path);
-}
-
-int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot,
- snapshot_id_list *snapshot_overwrites,
- struct printbuf *path)
-{
- return __bch2_inum_to_path(trans, 0, inum, snapshot, path);
-}
-
-/* fsck */
-
-static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- bool in_fsck)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = {};
- int ret = 0;
-
- if (inode_points_to_dirent(target, d))
- return 0;
-
- if (!bch2_inode_has_backpointer(target)) {
- fsck_err_on(S_ISDIR(target->bi_mode),
- trans, inode_dir_missing_backpointer,
- "directory with missing backpointer\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
- trans, inode_unlinked_but_has_dirent,
- "inode unlinked but has dirent\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- target->bi_flags &= ~BCH_INODE_unlinked;
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- return __bch2_fsck_write_inode(trans, target);
- }
-
- struct bkey_s_c_dirent bp_dirent =
- bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
- SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
- 0, dirent);
- ret = bkey_err(bp_dirent);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- bool backpointer_exists = !ret;
- ret = 0;
-
- if (!backpointer_exists) {
- if (fsck_err(trans, inode_wrong_backpointer,
- "inode %llu:%u has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- target->bi_inum, target->bi_snapshot,
- target->bi_dir,
- target->bi_dir_offset,
- d.k->p.inode,
- d.k->p.offset)) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, target);
- }
- } else {
- printbuf_reset(&buf);
- bch2_bkey_val_to_text(&buf, c, d.s_c);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
-
- if (S_ISDIR(target->bi_mode) || target->bi_subvol) {
- /*
- * XXX: verify connectivity of the other dirent
- * up to the root before removing this one
- *
- * Additionally, bch2_lookup would need to cope with the
- * dirent it found being removed - or should we remove
- * the other one, even though the inode points to it?
- */
- if (in_fsck) {
- if (fsck_err(trans, inode_dir_multiple_links,
- "%s %llu:%u with multiple links\n%s",
- S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
- target->bi_inum, target->bi_snapshot, buf.buf))
- ret = bch2_fsck_remove_dirent(trans, d.k->p);
- } else {
- bch2_fs_inconsistent(c,
- "%s %llu:%u with multiple links\n%s",
- S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
- target->bi_inum, target->bi_snapshot, buf.buf);
- }
-
- goto out;
- } else {
- /*
- * hardlinked file with nlink 0:
- * We're just adjusting nlink here so check_nlinks() will pick
- * it up; it ignores inodes with nlink 0
- */
- if (fsck_err_on(!target->bi_nlink,
- trans, inode_multiple_links_but_nlink_0,
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
- target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_unlinked;
- ret = __bch2_fsck_write_inode(trans, target);
- if (ret)
- goto err;
- }
- }
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int __bch2_check_dirent_target(struct btree_trans *trans,
- struct btree_iter *dirent_iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- bool in_fsck)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck);
- if (ret)
- goto err;
-
- if (fsck_err_on(d.v->d_type != inode_d_type(target),
- trans, dirent_d_type_wrong,
- "incorrect d_type: got %s, should be %s:\n%s",
- bch2_d_type_str(d.v->d_type),
- bch2_d_type_str(inode_d_type(target)),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = inode_d_type(target);
- if (n->v.d_type == DT_SUBVOL) {
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
- n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
- } else {
- n->v.d_inum = cpu_to_le64(target->bi_inum);
- }
-
- ret = bch2_trans_update(trans, dirent_iter, &n->k_i,
- BTREE_UPDATE_internal_snapshot_node);
- if (ret)
- goto err;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * BCH_INODE_has_case_insensitive:
- * We have to track whether directories have any descendant directory that is
- * casefolded - for overlayfs:
- */
-
-static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum)
-{
- struct btree_iter iter = {};
- int ret = 0;
-
- while (true) {
- struct bch_inode_unpacked inode;
- ret = bch2_inode_peek(trans, &iter, &inode, inum,
- BTREE_ITER_intent|BTREE_ITER_with_updates);
- if (ret)
- break;
-
- if (inode.bi_flags & BCH_INODE_has_case_insensitive)
- break;
-
- inode.bi_flags |= BCH_INODE_has_case_insensitive;
- ret = bch2_inode_write(trans, &iter, &inode);
- if (ret)
- break;
-
- bch2_trans_iter_exit(trans, &iter);
- if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM))
- break;
-
- inum = parent_inum(inum, &inode);
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- if (!bch2_inode_casefold(trans->c, inode))
- return 0;
-
- inode->bi_flags |= BCH_INODE_has_case_insensitive;
-
- return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode));
-}
-
-int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- snapshot_id_list *snapshot_overwrites,
- bool *do_update)
-{
- struct printbuf buf = PRINTBUF;
- bool repairing_parents = false;
- int ret = 0;
-
- if (!S_ISDIR(inode->bi_mode)) {
- /*
- * Old versions set bi_casefold for non dirs, but that's
- * unnecessary and wasteful
- */
- if (inode->bi_casefold) {
- inode->bi_casefold = 0;
- *do_update = true;
- }
- return 0;
- }
-
- if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive)
- return 0;
-
- if (bch2_inode_casefold(trans->c, inode) &&
- !(inode->bi_flags & BCH_INODE_has_case_insensitive)) {
- prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ",
- inode->bi_inum, inode->bi_snapshot);
-
- ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot,
- snapshot_overwrites, &buf);
- if (ret)
- goto err;
-
- if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) {
- inode->bi_flags |= BCH_INODE_has_case_insensitive;
- *do_update = true;
- }
- }
-
- if (!(inode->bi_flags & BCH_INODE_has_case_insensitive))
- goto out;
-
- struct bch_inode_unpacked dir = *inode;
- u32 snapshot = dir.bi_snapshot;
-
- while (!(dir.bi_inum == BCACHEFS_ROOT_INO &&
- dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
- if (dir.bi_parent_subvol) {
- ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot);
- if (ret)
- goto err;
-
- snapshot_overwrites = NULL;
- }
-
- ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0);
- if (ret)
- goto err;
-
- if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) {
- prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n");
-
- ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot,
- snapshot_overwrites, &buf);
- if (ret)
- goto err;
-
- if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) {
- dir.bi_flags |= BCH_INODE_has_case_insensitive;
- ret = __bch2_fsck_write_inode(trans, &dir);
- if (ret)
- goto err;
- }
- }
-
- /*
- * We only need to check the first parent, unless we find an
- * inconsistency
- */
- if (!repairing_parents)
- break;
- }
-out:
-err:
-fsck_err:
- printbuf_exit(&buf);
- if (ret)
- return ret;
-
- if (repairing_parents) {
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- }
-
- return 0;
-}
diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h
deleted file mode 100644
index ae6ebc2d0785..000000000000
--- a/fs/bcachefs/namei.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NAMEI_H
-#define _BCACHEFS_NAMEI_H
-
-#include "dirent.h"
-
-struct posix_acl;
-
-#define BCH_CREATE_TMPFILE (1U << 0)
-#define BCH_CREATE_SUBVOL (1U << 1)
-#define BCH_CREATE_SNAPSHOT (1U << 2)
-#define BCH_CREATE_SNAPSHOT_RO (1U << 3)
-
-int bch2_create_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- const struct qstr *,
- uid_t, gid_t, umode_t, dev_t,
- struct posix_acl *,
- struct posix_acl *,
- subvol_inum, unsigned);
-
-int bch2_link_trans(struct btree_trans *,
- subvol_inum, struct bch_inode_unpacked *,
- subvol_inum, struct bch_inode_unpacked *,
- const struct qstr *);
-
-int bch2_unlink_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- const struct qstr *, bool);
-
-int bch2_rename_trans(struct btree_trans *,
- subvol_inum, struct bch_inode_unpacked *,
- subvol_inum, struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
- const struct qstr *,
- const struct qstr *,
- enum bch_rename_mode);
-
-bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
- struct bch_inode_unpacked *);
-
-int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
-int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32,
- snapshot_id_list *, struct printbuf *);
-
-int __bch2_check_dirent_target(struct btree_trans *,
- struct btree_iter *,
- struct bkey_s_c_dirent,
- struct bch_inode_unpacked *, bool);
-
-static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
-static inline int bch2_check_dirent_target(struct btree_trans *trans,
- struct btree_iter *dirent_iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target,
- bool in_fsck)
-{
- if (likely(inode_points_to_dirent(target, d) &&
- d.v->d_type == inode_d_type(target)))
- return 0;
-
- return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck);
-}
-
-int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *,
- snapshot_id_list *, bool *);
-
-#endif /* _BCACHEFS_NAMEI_H */
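For context on the header just removed: the inline bch2_check_dirent_target() is a fast path that returns immediately when the target inode's back-reference (bi_dir, bi_dir_offset) already points at the dirent and the d_type matches, and only falls back to the slow __bch2_check_dirent_target() repair path otherwise. A minimal sketch of a hypothetical caller (the function name is made up; lookup of d and target is omitted):

#include "namei.h"

/* Hypothetical sketch: verify/repair one dirent <-> inode link. The common
 * case is a plain compare of bi_dir/bi_dir_offset and d_type with no
 * transaction updates; only inconsistent links take the repair path. */
static int example_check_link(struct btree_trans *trans,
			      struct btree_iter *dirent_iter,
			      struct bkey_s_c_dirent d,
			      struct bch_inode_unpacked *target)
{
	return bch2_check_dirent_target(trans, dirent_iter, d, target, true);
}
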
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
deleted file mode 100644
index 962218fa68ec..000000000000
--- a/fs/bcachefs/nocow_locking.c
+++ /dev/null
@@ -1,142 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bkey_methods.h"
-#include "nocow_locking.h"
-#include "util.h"
-
-#include <linux/closure.h>
-
-bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (l->b[i] == dev_bucket && atomic_read(&l->l[i]))
- return true;
- return false;
-}
-
-#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0)
-
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
- int lock_val = flags ? 1 : -1;
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (l->b[i] == dev_bucket) {
- int v = atomic_sub_return(lock_val, &l->l[i]);
-
- BUG_ON(v && sign(v) != lock_val);
- if (!v)
- closure_wake_up(&l->wait);
- return;
- }
-
- BUG();
-}
-
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
- u64 dev_bucket, int flags)
-{
- int v, lock_val = flags ? 1 : -1;
- unsigned i;
-
- spin_lock(&l->lock);
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (l->b[i] == dev_bucket)
- goto got_entry;
-
- for (i = 0; i < ARRAY_SIZE(l->b); i++)
- if (!atomic_read(&l->l[i])) {
- l->b[i] = dev_bucket;
- goto take_lock;
- }
-fail:
- spin_unlock(&l->lock);
- return false;
-got_entry:
- v = atomic_read(&l->l[i]);
- if (lock_val > 0 ? v < 0 : v > 0)
- goto fail;
-take_lock:
- v = atomic_read(&l->l[i]);
- /* Overflow? */
- if (v && sign(v + lock_val) != sign(v))
- goto fail;
-
- atomic_add(lock_val, &l->l[i]);
- spin_unlock(&l->lock);
- return true;
-}
-
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- struct nocow_lock_bucket *l,
- u64 dev_bucket, int flags)
-{
- if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
- struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
- u64 start_time = local_clock();
-
- __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
- bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
- }
-}
-
-void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
-
-{
- unsigned i, nr_zero = 0;
- struct nocow_lock_bucket *l;
-
- for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) {
- unsigned v = 0;
-
- for (i = 0; i < ARRAY_SIZE(l->l); i++)
- v |= atomic_read(&l->l[i]);
-
- if (!v) {
- nr_zero++;
- continue;
- }
-
- if (nr_zero)
- prt_printf(out, "(%u empty entries)\n", nr_zero);
- nr_zero = 0;
-
- for (i = 0; i < ARRAY_SIZE(l->l); i++) {
- int v = atomic_read(&l->l[i]);
- if (v) {
- bch2_bpos_to_text(out, u64_to_bucket(l->b[i]));
- prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v));
- }
- }
- prt_newline(out);
- }
-
- if (nr_zero)
- prt_printf(out, "(%u empty entries)\n", nr_zero);
-}
-
-void bch2_fs_nocow_locking_exit(struct bch_fs *c)
-{
- struct bucket_nocow_lock_table *t = &c->nocow_locks;
-
- for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
- for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++)
- BUG_ON(atomic_read(&l->l[j]));
-}
-
-void bch2_fs_nocow_locking_init_early(struct bch_fs *c)
-{
- struct bucket_nocow_lock_table *t = &c->nocow_locks;
-
- for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
- spin_lock_init(&l->lock);
-}
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
deleted file mode 100644
index 48b8a003c0d2..000000000000
--- a/fs/bcachefs/nocow_locking.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NOCOW_LOCKING_H
-#define _BCACHEFS_NOCOW_LOCKING_H
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "nocow_locking_types.h"
-
-#include <linux/hash.h>
-
-static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- u64 dev_bucket)
-{
- unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS);
-
- return t->l + (h & (BUCKET_NOCOW_LOCKS - 1));
-}
-
-#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0)
-
-bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
-void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
-bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
-void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
- struct nocow_lock_bucket *, u64, int);
-
-static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
- struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
- __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
-}
-
-static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
- struct bpos bucket, int flags)
-{
- u64 dev_bucket = bucket_to_u64(bucket);
- struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
-
- return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
-}
-
-void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
-
-void bch2_fs_nocow_locking_exit(struct bch_fs *);
-void bch2_fs_nocow_locking_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_NOCOW_LOCKING_H */
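For reference, the nocow locking interface just removed hashes each (device, bucket) pair into one of BUCKET_NOCOW_LOCKS table entries, each tracking up to four buckets with a signed count: the BUCKET_NOCOW_LOCK_UPDATE side counts up and the copy/move side counts down, so the two sides exclude each other while either side can be taken multiple times. A minimal usage sketch of the update side, assuming a hypothetical write path where c and the bucket position are already known:

#include "nocow_locking.h"

/* Hypothetical sketch of an in-place (nocow) write: take the update-side
 * lock on the bucket, do the write, then release with the same flags so the
 * signed lock count returns to zero. */
static void example_nocow_update(struct bch_fs *c, struct bpos bucket)
{
	bch2_bucket_nocow_lock(&c->nocow_locks, bucket, BUCKET_NOCOW_LOCK_UPDATE);

	/* ... submit the in-place data write here ... */

	bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, BUCKET_NOCOW_LOCK_UPDATE);
}
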
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
deleted file mode 100644
index bd12bf677924..000000000000
--- a/fs/bcachefs/nocow_locking_types.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H
-#define _BCACHEFS_NOCOW_LOCKING_TYPES_H
-
-#define BUCKET_NOCOW_LOCKS_BITS 10
-#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS)
-
-struct nocow_lock_bucket {
- struct closure_waitlist wait;
- spinlock_t lock;
- u64 b[4];
- atomic_t l[4];
-} __aligned(SMP_CACHE_BYTES);
-
-struct bucket_nocow_lock_table {
- struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS];
-};
-
-#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */
-
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
deleted file mode 100644
index b1cf88905b81..000000000000
--- a/fs/bcachefs/opts.c
+++ /dev/null
@@ -1,844 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/kernel.h>
-#include <linux/fs_parser.h>
-
-#include "bcachefs.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "movinggc.h"
-#include "opts.h"
-#include "rebalance.h"
-#include "recovery_passes.h"
-#include "super-io.h"
-#include "util.h"
-
-#define x(t, n, ...) [n] = #t,
-
-const char * const bch2_error_actions[] = {
- BCH_ERROR_ACTIONS()
- NULL
-};
-
-const char * const bch2_degraded_actions[] = {
- BCH_DEGRADED_ACTIONS()
- NULL
-};
-
-const char * const bch2_fsck_fix_opts[] = {
- BCH_FIX_ERRORS_OPTS()
- NULL
-};
-
-const char * const bch2_version_upgrade_opts[] = {
- BCH_VERSION_UPGRADE_OPTS()
- NULL
-};
-
-const char * const bch2_sb_features[] = {
- BCH_SB_FEATURES()
- NULL
-};
-
-const char * const bch2_sb_compat[] = {
- BCH_SB_COMPAT()
- NULL
-};
-
-const char * const __bch2_btree_ids[] = {
- BCH_BTREE_IDS()
- NULL
-};
-
-const char * const __bch2_csum_types[] = {
- BCH_CSUM_TYPES()
- NULL
-};
-
-const char * const __bch2_csum_opts[] = {
- BCH_CSUM_OPTS()
- NULL
-};
-
-const char * const __bch2_compression_types[] = {
- BCH_COMPRESSION_TYPES()
- NULL
-};
-
-const char * const bch2_compression_opts[] = {
- BCH_COMPRESSION_OPTS()
- NULL
-};
-
-const char * const __bch2_str_hash_types[] = {
- BCH_STR_HASH_TYPES()
- NULL
-};
-
-const char * const bch2_str_hash_opts[] = {
- BCH_STR_HASH_OPTS()
- NULL
-};
-
-const char * const __bch2_data_types[] = {
- BCH_DATA_TYPES()
- NULL
-};
-
-const char * const bch2_member_states[] = {
- BCH_MEMBER_STATES()
- NULL
-};
-
-static const char * const __bch2_jset_entry_types[] = {
- BCH_JSET_ENTRY_TYPES()
- NULL
-};
-
-static const char * const __bch2_fs_usage_types[] = {
- BCH_FS_USAGE_TYPES()
- NULL
-};
-
-#undef x
-
-static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
- unsigned nr, const char *type, unsigned idx)
-{
- if (idx < nr)
- prt_str(out, opts[idx]);
- else
- prt_printf(out, "(unknown %s %u)", type, idx);
-}
-
-#define PRT_STR_OPT_BOUNDSCHECKED(name, type) \
-void bch2_prt_##name(struct printbuf *out, type t) \
-{ \
- prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
-}
-
-PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type);
-PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
-PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
-PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
-PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
-PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
-PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
-
-static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
- struct printbuf *err)
-{
- if (!val) {
- *res = FSCK_FIX_yes;
- } else {
- int ret = match_string(bch2_fsck_fix_opts, -1, val);
-
- if (ret < 0 && err)
- prt_str(err, "fix_errors: invalid selection");
- if (ret < 0)
- return ret;
- *res = ret;
- }
-
- return 0;
-}
-
-static void bch2_opt_fix_errors_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
-{
- prt_str(out, bch2_fsck_fix_opts[v]);
-}
-
-#define bch2_opt_fix_errors (struct bch_opt_fn) { \
- .parse = bch2_opt_fix_errors_parse, \
- .to_text = bch2_opt_fix_errors_to_text, \
-}
-
-const char * const bch2_d_types[BCH_DT_MAX] = {
- [DT_UNKNOWN] = "unknown",
- [DT_FIFO] = "fifo",
- [DT_CHR] = "chr",
- [DT_DIR] = "dir",
- [DT_BLK] = "blk",
- [DT_REG] = "reg",
- [DT_LNK] = "lnk",
- [DT_SOCK] = "sock",
- [DT_WHT] = "whiteout",
- [DT_SUBVOL] = "subvol",
-};
-
-void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
-{
-#define x(_name, ...) \
- if (opt_defined(src, _name)) \
- opt_set(*dst, _name, src._name);
-
- BCH_OPTS()
-#undef x
-}
-
-bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
-{
- switch (id) {
-#define x(_name, ...) \
- case Opt_##_name: \
- return opt_defined(*opts, _name);
- BCH_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
-{
- switch (id) {
-#define x(_name, ...) \
- case Opt_##_name: \
- return opts->_name;
- BCH_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
-{
- switch (id) {
-#define x(_name, ...) \
- case Opt_##_name: \
- opt_set(*opts, _name, v); \
- break;
- BCH_OPTS()
-#undef x
- default:
- BUG();
- }
-}
-
-/* dummy option, for options that aren't stored in the superblock */
-typedef u64 (*sb_opt_get_fn)(const struct bch_sb *);
-typedef void (*sb_opt_set_fn)(struct bch_sb *, u64);
-typedef u64 (*member_opt_get_fn)(const struct bch_member *);
-typedef void (*member_opt_set_fn)(struct bch_member *, u64);
-
-__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL;
-__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL;
-__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL;
-__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL;
-
-#define type_compatible_or_null(_p, _type) \
- __builtin_choose_expr( \
- __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL)
-
-const struct bch_option bch2_opt_table[] = {
-#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
-#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
- .min = _min, .max = _max
-#define OPT_STR(_choices) .type = BCH_OPT_STR, \
- .min = 0, .max = ARRAY_SIZE(_choices) - 1, \
- .choices = _choices
-#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
- .min = 0, .max = U64_MAX, \
- .choices = _choices
-#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \
- .choices = _choices
-#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
-
-#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
- [Opt_##_name] = { \
- .attr.name = #_name, \
- .attr.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
- .flags = _flags, \
- .hint = _hint, \
- .help = _help, \
- .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \
- .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \
- .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \
- .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\
- _type \
- },
-
- BCH_OPTS()
-#undef x
-};
-
-int bch2_opt_lookup(const char *name)
-{
- const struct bch_option *i;
-
- for (i = bch2_opt_table;
- i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
- i++)
- if (!strcmp(name, i->attr.name))
- return i - bch2_opt_table;
-
- return -1;
-}
-
-struct opt_synonym {
- const char *s1, *s2;
-};
-
-static const struct opt_synonym bch2_opt_synonyms[] = {
- { "quota", "usrquota" },
-};
-
-static int bch2_mount_opt_lookup(const char *name)
-{
- const struct opt_synonym *i;
-
- for (i = bch2_opt_synonyms;
- i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms);
- i++)
- if (!strcmp(name, i->s1))
- name = i->s2;
-
- return bch2_opt_lookup(name);
-}
-
-struct opt_val_synonym {
- const char *opt, *v1, *v2;
-};
-
-static const struct opt_val_synonym bch2_opt_val_synonyms[] = {
- { "degraded", "true", "yes" },
- { "degraded", "false", "no" },
- { "degraded", "1", "yes" },
- { "degraded", "0", "no" },
-};
-
-static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val)
-{
- const struct opt_val_synonym *i;
-
- for (i = bch2_opt_val_synonyms;
- i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms);
- i++)
- if (!strcmp(opt, i->opt) && !strcmp(val, i->v1))
- return i->v2;
-
- return val;
-}
-
-int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
-{
- if (v < opt->min) {
- if (err)
- prt_printf(err, "%s: too small (min %llu)",
- opt->attr.name, opt->min);
- return -BCH_ERR_ERANGE_option_too_small;
- }
-
- if (opt->max && v >= opt->max) {
- if (err)
- prt_printf(err, "%s: too big (max %llu)",
- opt->attr.name, opt->max);
- return -BCH_ERR_ERANGE_option_too_big;
- }
-
- if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
- if (err)
- prt_printf(err, "%s: not a multiple of 512",
- opt->attr.name);
- return -BCH_ERR_opt_parse_error;
- }
-
- if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
- if (err)
- prt_printf(err, "%s: must be a power of two",
- opt->attr.name);
- return -BCH_ERR_opt_parse_error;
- }
-
- if (opt->fn.validate)
- return opt->fn.validate(v, err);
-
- return 0;
-}
-
-int bch2_opt_parse(struct bch_fs *c,
- const struct bch_option *opt,
- const char *val, u64 *res,
- struct printbuf *err)
-{
- ssize_t ret;
-
- if (err)
- printbuf_indent_add_nextline(err, 2);
-
- switch (opt->type) {
- case BCH_OPT_BOOL:
- if (!val)
- val = "1";
-
- ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
- if (ret != -BCH_ERR_option_not_bool) {
- *res = ret;
- } else {
- if (err)
- prt_printf(err, "%s: must be bool", opt->attr.name);
- return ret;
- }
- break;
- case BCH_OPT_UINT:
- if (!val) {
- prt_printf(err, "%s: required value",
- opt->attr.name);
- return -EINVAL;
- }
-
- if (*val != '-') {
- ret = opt->flags & OPT_HUMAN_READABLE
- ? bch2_strtou64_h(val, res)
- : kstrtou64(val, 10, res);
- } else {
- prt_printf(err, "%s: must be a non-negative number", opt->attr.name);
- return -BCH_ERR_option_negative;
- }
-
- if (ret < 0) {
- if (err)
- prt_printf(err, "%s: must be a number",
- opt->attr.name);
- return ret;
- }
- break;
- case BCH_OPT_STR:
- if (!val) {
- prt_printf(err, "%s: required value",
- opt->attr.name);
- return -EINVAL;
- }
-
- ret = match_string(opt->choices, -1, val);
- if (ret < 0) {
- if (err)
- prt_printf(err, "%s: invalid selection",
- opt->attr.name);
- return ret;
- }
-
- *res = ret;
- break;
- case BCH_OPT_BITFIELD: {
- s64 v = bch2_read_flag_list(val, opt->choices);
- if (v < 0)
- return v;
- *res = v;
- break;
- }
- case BCH_OPT_FN:
- ret = opt->fn.parse(c, val, res, err);
-
- if (ret == -BCH_ERR_option_needs_open_fs)
- return ret;
-
- if (ret < 0) {
- if (err)
- prt_printf(err, "%s: parse error",
- opt->attr.name);
- return ret;
- }
- }
-
- return bch2_opt_validate(opt, *res, err);
-}
-
-void bch2_opt_to_text(struct printbuf *out,
- struct bch_fs *c, struct bch_sb *sb,
- const struct bch_option *opt, u64 v,
- unsigned flags)
-{
- if (flags & OPT_SHOW_MOUNT_STYLE) {
- if (opt->type == BCH_OPT_BOOL) {
- prt_printf(out, "%s%s",
- v ? "" : "no",
- opt->attr.name);
- return;
- }
-
- prt_printf(out, "%s=", opt->attr.name);
- }
-
- switch (opt->type) {
- case BCH_OPT_BOOL:
- case BCH_OPT_UINT:
- if (opt->flags & OPT_HUMAN_READABLE)
- prt_human_readable_u64(out, v);
- else
- prt_printf(out, "%lli", v);
- break;
- case BCH_OPT_STR:
- if (v < opt->min || v >= opt->max)
- prt_printf(out, "(invalid option %lli)", v);
- else if (flags & OPT_SHOW_FULL_LIST)
- prt_string_option(out, opt->choices, v);
- else
- prt_str(out, opt->choices[v]);
- break;
- case BCH_OPT_BITFIELD:
- prt_bitflags(out, opt->choices, v);
- break;
- case BCH_OPT_FN:
- opt->fn.to_text(out, c, sb, v);
- break;
- default:
- BUG();
- }
-}
-
-void bch2_opts_to_text(struct printbuf *out,
- struct bch_opts opts,
- struct bch_fs *c, struct bch_sb *sb,
- unsigned show_mask, unsigned hide_mask,
- unsigned flags)
-{
- bool first = true;
-
- for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
-
- if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
- continue;
-
- u64 v = bch2_opt_get_by_id(&opts, i);
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
-
- if (!first)
- prt_char(out, ',');
- first = false;
-
- bch2_opt_to_text(out, c, sb, opt, v, flags);
- }
-}
-
-int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v)
-{
- int ret = 0;
-
- switch (id) {
- case Opt_state:
- if (ca)
- return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED);
- break;
-
- case Opt_compression:
- case Opt_background_compression:
- ret = bch2_check_set_has_compressed_data(c, v);
- break;
- case Opt_erasure_code:
- if (v)
- bch2_check_set_feature(c, BCH_FEATURE_ec);
- break;
- default:
- break;
- }
-
- return ret;
-}
-
-int bch2_opts_hooks_pre_set(struct bch_fs *c)
-{
- for (unsigned i = 0; i < bch2_opts_nr; i++) {
- int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
- struct bch_opts *new_opts, enum bch_opt_id id)
-{
- switch (id) {
- case Opt_foreground_target:
- if (new_opts->foreground_target &&
- !new_opts->background_target)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
- case Opt_compression:
- if (new_opts->compression &&
- !new_opts->background_compression)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
- case Opt_background_target:
- if (new_opts->background_target)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
- case Opt_background_compression:
- if (new_opts->background_compression)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
- case Opt_rebalance_enabled:
- bch2_rebalance_wakeup(c);
- break;
- case Opt_copygc_enabled:
- bch2_copygc_wakeup(c);
- break;
- case Opt_discard:
- if (!ca) {
- mutex_lock(&c->sb_lock);
- for_each_member_device(c, ca) {
- struct bch_member *m =
- bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_DISCARD(m, c->opts.discard);
- }
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
- break;
- case Opt_version_upgrade:
- /*
- * XXX: in the future we'll likely want to do compatible
- * upgrades at runtime as well, but right now there's nothing
- * that does that:
- */
- if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible)
- bch2_sb_upgrade_incompat(c);
- break;
- default:
- break;
- }
-}
-
-int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
- struct printbuf *parse_later,
- const char *name, const char *val)
-{
- struct printbuf err = PRINTBUF;
- u64 v;
- int ret, id;
-
- id = bch2_mount_opt_lookup(name);
-
- /* Check for the form "noopt", negation of a boolean opt: */
- if (id < 0 &&
- !val &&
- !strncmp("no", name, 2)) {
- id = bch2_mount_opt_lookup(name + 2);
- val = "0";
- }
-
- /* Unknown options are ignored: */
- if (id < 0)
- return 0;
-
- /* must have a value for synonym lookup - but OPT_FN is weird */
- if (!val && bch2_opt_table[id].type != BCH_OPT_FN)
- val = "1";
-
- val = bch2_opt_val_synonym_lookup(name, val);
-
- if (!(bch2_opt_table[id].flags & OPT_MOUNT))
- goto bad_opt;
-
- if (id == Opt_acl &&
- !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
- goto bad_opt;
-
- if ((id == Opt_usrquota ||
- id == Opt_grpquota) &&
- !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
- goto bad_opt;
-
- ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
- if (ret == -BCH_ERR_option_needs_open_fs) {
- ret = 0;
-
- if (parse_later) {
- prt_printf(parse_later, "%s=%s,", name, val);
- if (parse_later->allocation_failure)
- ret = -ENOMEM;
- }
-
- goto out;
- }
-
- if (ret < 0)
- goto bad_val;
-
- if (opts)
- bch2_opt_set_by_id(opts, id, v);
-
- ret = 0;
-out:
- printbuf_exit(&err);
- return ret;
-bad_opt:
- ret = -BCH_ERR_option_name;
- goto out;
-bad_val:
- ret = -BCH_ERR_option_value;
- goto out;
-}
-
-int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
- struct printbuf *parse_later, char *options,
- bool ignore_unknown)
-{
- char *copied_opts, *copied_opts_start;
- char *opt, *name, *val;
- int ret = 0;
-
- if (!options)
- return 0;
-
- /*
- * sys_fsconfig() is now occasionally providing us with option lists
- * starting with a comma - weird.
- */
- if (*options == ',')
- options++;
-
- copied_opts = kstrdup(options, GFP_KERNEL);
- if (!copied_opts)
- return -ENOMEM;
- copied_opts_start = copied_opts;
-
- while ((opt = strsep(&copied_opts, ",")) != NULL) {
- if (!*opt)
- continue;
-
- name = strsep(&opt, "=");
- val = opt;
-
- ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
- if (ret == -BCH_ERR_option_name && ignore_unknown)
- ret = 0;
- if (ret) {
- pr_err("Error parsing option %s: %s", name, bch2_err_str(ret));
- break;
- }
- }
-
- kfree(copied_opts_start);
- return ret;
-}
-
-u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx)
-{
- const struct bch_option *opt = bch2_opt_table + id;
- u64 v;
-
- if (dev_idx < 0) {
- v = opt->get_sb(sb);
- } else {
- if (WARN(!bch2_member_exists(sb, dev_idx),
- "tried to set device option %s on nonexistent device %i",
- opt->attr.name, dev_idx))
- return 0;
-
- struct bch_member m = bch2_sb_member_get(sb, dev_idx);
- v = opt->get_member(&m);
- }
-
- if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
- --v;
-
- if (opt->flags & OPT_SB_FIELD_ILOG2)
- v = 1ULL << v;
-
- if (opt->flags & OPT_SB_FIELD_SECTORS)
- v <<= 9;
-
- return v;
-}
-
-/*
- * Initial options from superblock - here we don't want any options undefined,
- * any options the superblock doesn't specify are set to 0:
- */
-int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
-{
- for (unsigned id = 0; id < bch2_opts_nr; id++) {
- const struct bch_option *opt = bch2_opt_table + id;
-
- if (opt->get_sb)
- bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1));
- }
-
- return 0;
-}
-
-bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
- const struct bch_option *opt, u64 v)
-{
- bool changed = false;
-
- if (opt->flags & OPT_SB_FIELD_SECTORS)
- v >>= 9;
-
- if (opt->flags & OPT_SB_FIELD_ILOG2)
- v = ilog2(v);
-
- if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
- v++;
-
- if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) {
- changed = v != opt->get_sb(sb);
-
- opt->set_sb(sb, v);
- }
-
- if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) {
- if (WARN(!bch2_member_exists(sb, dev_idx),
- "tried to set device option %s on nonexistent device %i",
- opt->attr.name, dev_idx))
- return false;
-
- struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
- changed = v != opt->get_member(m);
- opt->set_member(m, v);
- }
-
- return changed;
-}
-
-bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
- const struct bch_option *opt, u64 v)
-{
- mutex_lock(&c->sb_lock);
- bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
- if (changed)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- return changed;
-}
-
-/* io opts: */
-
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
-{
- struct bch_io_opts opts = {
-#define x(_name, _bits) ._name = src._name,
- BCH_INODE_OPTS()
-#undef x
- };
-
- bch2_io_opts_fixups(&opts);
- return opts;
-}
-
-bool bch2_opt_is_inode_opt(enum bch_opt_id id)
-{
- static const enum bch_opt_id inode_opt_list[] = {
-#define x(_name, _bits) Opt_##_name,
- BCH_INODE_OPTS()
-#undef x
- };
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
- if (inode_opt_list[i] == id)
- return true;
-
- return false;
-}
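For reference, the mount-option parser deleted above splits a comma-separated option string, resolves option and value synonyms, and either applies each recognized option to a struct bch_opts or defers it into parse_later when parsing requires an open filesystem. A minimal sketch of a hypothetical caller (the function name and example option string are made up for illustration):

#include "opts.h"
#include "printbuf.h"

/* Hypothetical sketch: parse a mount-style option string into a bch_opts
 * struct; options that need an open filesystem are queued in parse_later. */
static int example_parse_opts(struct bch_fs *c, char *options)
{
	struct bch_opts opts = bch2_opts_empty();
	struct printbuf parse_later = PRINTBUF;

	/* e.g. options = "metadata_replicas=2,verbose" */
	int ret = bch2_parse_mount_opts(c, &opts, &parse_later, options, false);

	printbuf_exit(&parse_later);
	return ret;
}
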
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
deleted file mode 100644
index 63f8e254495c..000000000000
--- a/fs/bcachefs/opts.h
+++ /dev/null
@@ -1,693 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_OPTS_H
-#define _BCACHEFS_OPTS_H
-
-#include <linux/bug.h>
-#include <linux/log2.h>
-#include <linux/string.h>
-#include <linux/sysfs.h>
-#include "bcachefs_format.h"
-
-struct bch_fs;
-
-extern const char * const bch2_error_actions[];
-extern const char * const bch2_degraded_actions[];
-extern const char * const bch2_fsck_fix_opts[];
-extern const char * const bch2_version_upgrade_opts[];
-extern const char * const bch2_sb_features[];
-extern const char * const bch2_sb_compat[];
-extern const char * const __bch2_btree_ids[];
-extern const char * const __bch2_csum_types[];
-extern const char * const __bch2_csum_opts[];
-extern const char * const __bch2_compression_types[];
-extern const char * const bch2_compression_opts[];
-extern const char * const __bch2_str_hash_types[];
-extern const char * const bch2_str_hash_opts[];
-extern const char * const __bch2_data_types[];
-extern const char * const bch2_member_states[];
-extern const char * const bch2_d_types[];
-
-void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
-void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
-void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
-void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
-void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
-void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
-void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
-
-static inline const char *bch2_d_type_str(unsigned d_type)
-{
- return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
-}
-
-/*
- * Mount options; we also store defaults in the superblock.
- *
- * Also exposed via sysfs: if an option is writeable, and it's also stored in
- * the superblock, changing it via sysfs (currently? might change this) also
- * updates the superblock.
- *
- * We store options as signed integers, where -1 means undefined. This means we
- * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
- * apply the options from that struct that are defined.
- */
-
-/* When can be set: */
-enum opt_flags {
- OPT_FS = BIT(0), /* Filesystem option */
- OPT_DEVICE = BIT(1), /* Device option */
- OPT_INODE = BIT(2), /* Inode option */
- OPT_FORMAT = BIT(3), /* May be specified at format time */
- OPT_MOUNT = BIT(4), /* May be specified at mount time */
- OPT_RUNTIME = BIT(5), /* May be specified at runtime */
- OPT_HUMAN_READABLE = BIT(6),
- OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */
- OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */
- OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */
- OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
- OPT_HIDDEN = BIT(11),
-};
-
-enum opt_type {
- BCH_OPT_BOOL,
- BCH_OPT_UINT,
- BCH_OPT_STR,
- BCH_OPT_BITFIELD,
- BCH_OPT_FN,
-};
-
-struct bch_opt_fn {
- int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
- void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
- int (*validate)(u64, struct printbuf *);
-};
-
-/**
- * x(name, shortopt, type, in mem type, mode, sb_opt)
- *
- * @name - name of mount option, sysfs attribute, and struct bch_opts
- * member
- *
- * @mode - when opt may be set
- *
- * @sb_option - name of corresponding superblock option
- *
- * @type - one of OPT_BOOL, OPT_UINT, OPT_STR
- */
-
-/*
- * XXX: add fields for
- * - default value
- * - helptext
- */
-
-#ifdef __KERNEL__
-#define RATELIMIT_ERRORS_DEFAULT true
-#else
-#define RATELIMIT_ERRORS_DEFAULT false
-#endif
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCACHEFS_VERBOSE_DEFAULT true
-#else
-#define BCACHEFS_VERBOSE_DEFAULT false
-#endif
-
-#define BCH_FIX_ERRORS_OPTS() \
- x(exit, 0) \
- x(yes, 1) \
- x(no, 2) \
- x(ask, 3)
-
-enum fsck_err_opts {
-#define x(t, n) FSCK_FIX_##t,
- BCH_FIX_ERRORS_OPTS()
-#undef x
-};
-
-#define BCH_OPTS() \
- x(block_size, u16, \
- OPT_FS|OPT_FORMAT| \
- OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(512, 1U << 16), \
- BCH_SB_BLOCK_SIZE, 4 << 10, \
- "size", NULL) \
- x(btree_node_size, u32, \
- OPT_FS|OPT_FORMAT| \
- OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(512, 1U << 20), \
- BCH_SB_BTREE_NODE_SIZE, 256 << 10, \
- "size", "Btree node size, default 256k") \
- x(errors, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_error_actions), \
- BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
- NULL, "Action to take on filesystem error") \
- x(write_error_timeout, u16, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, 300), \
- BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
- NULL, "Number of consecutive write errors allowed before kicking out a device")\
- x(metadata_replicas, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_META_REPLICAS_WANT, 1, \
- "#", "Number of metadata replicas") \
- x(data_replicas, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_DATA_REPLICAS_WANT, 1, \
- "#", "Number of data replicas") \
- x(metadata_replicas_required, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_META_REPLICAS_REQ, 1, \
- "#", NULL) \
- x(data_replicas_required, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_UINT(1, BCH_REPLICAS_MAX), \
- BCH_SB_DATA_REPLICAS_REQ, 1, \
- "#", NULL) \
- x(encoded_extent_max, u32, \
- OPT_FS|OPT_FORMAT| \
- OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\
- OPT_UINT(4096, 2U << 20), \
- BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \
- "size", "Maximum size of checksummed/compressed extents")\
- x(metadata_checksum, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(__bch2_csum_opts), \
- BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
- NULL, NULL) \
- x(data_checksum, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(__bch2_csum_opts), \
- BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
- NULL, NULL) \
- x(checksum_err_retry_nr, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, 32), \
- BCH_SB_CSUM_ERR_RETRY_NR, 3, \
- NULL, NULL) \
- x(compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_compression), \
- BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
- NULL, NULL) \
- x(background_compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_compression), \
- BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
- NULL, NULL) \
- x(str_hash, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_str_hash_opts), \
- BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
- NULL, "Hash function for directory entries and xattrs")\
- x(metadata_target, u16, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_METADATA_TARGET, 0, \
- "(target)", "Device or label for metadata writes") \
- x(foreground_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_FOREGROUND_TARGET, 0, \
- "(target)", "Device or label for foreground writes") \
- x(background_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_BACKGROUND_TARGET, 0, \
- "(target)", "Device or label to move data to in the background")\
- x(promote_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_FN(bch2_opt_target), \
- BCH_SB_PROMOTE_TARGET, 0, \
- "(target)", "Device or label to promote data to on read") \
- x(erasure_code, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_ERASURE_CODE, false, \
- NULL, "Enable erasure coding (DO NOT USE YET)") \
- x(casefold, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT, \
- OPT_BOOL(), \
- BCH_SB_CASEFOLD, false, \
- NULL, "Dirent lookups are casefolded") \
- x(casefold_disabled, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Disable casefolding filesystem wide") \
- x(inodes_32bit, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_INODE_32BIT, true, \
- NULL, "Constrain inode numbers to 32 bits") \
- x(shard_inode_numbers_bits, u8, \
- OPT_FS|OPT_FORMAT, \
- OPT_UINT(0, 8), \
- BCH_SB_SHARD_INUMS_NBITS, 0, \
- NULL, "Shard new inode numbers by CPU id") \
- x(inodes_use_key_cache, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_INODES_USE_KEY_CACHE, true, \
- NULL, "Use the btree key cache for the inodes btree") \
- x(btree_node_mem_ptr_optimization, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Stash pointer to in memory btree node in btree ptr")\
- x(gc_reserve_percent, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(5, 21), \
- BCH_SB_GC_RESERVE, 8, \
- "%", "Percentage of disk space to reserve for copygc")\
- x(gc_reserve_bytes, u64, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \
- OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(0, U64_MAX), \
- BCH_SB_GC_RESERVE_BYTES, 0, \
- "%", "Amount of disk space to reserve for copygc\n" \
- "Takes precedence over gc_reserve_percent if set")\
- x(root_reserve_percent, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_UINT(0, 100), \
- BCH_SB_ROOT_RESERVE, 0, \
- "%", "Percentage of disk space to reserve for superuser")\
- x(wide_macs, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_128_BIT_MACS, false, \
- NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\
- x(inline_data, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable inline data extents") \
- x(promote_whole_extents, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \
- NULL, "Promote whole extents, instead of just part being read")\
- x(acl, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_POSIX_ACL, true, \
- NULL, "Enable POSIX acls") \
- x(usrquota, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_USRQUOTA, false, \
- NULL, "Enable user quotas") \
- x(grpquota, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_GRPQUOTA, false, \
- NULL, "Enable group quotas") \
- x(prjquota, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH_SB_PRJQUOTA, false, \
- NULL, "Enable project quotas") \
- x(degraded, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_STR(bch2_degraded_actions), \
- BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \
- NULL, "Allow mounting in degraded mode") \
- x(no_splitbrain_check, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don't kick drives out when splitbrain detected")\
- x(verbose, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \
- NULL, "Extra debugging information during mount/recovery")\
- x(journal_flush_delay, u32, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, U32_MAX), \
- BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \
- NULL, "Delay in milliseconds before automatic journal commits")\
- x(journal_flush_disabled, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_JOURNAL_FLUSH_DISABLED,false, \
- NULL, "Disable journal flush on sync/fsync\n" \
- "If enabled, writes can be lost, but only since the\n"\
- "last journal write (default 1 second)") \
- x(journal_reclaim_delay, u32, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, U32_MAX), \
- BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \
- NULL, "Delay in milliseconds before automatic journal reclaim")\
- x(move_bytes_in_flight, u32, \
- OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1024, U32_MAX), \
- BCH2_NO_SB_OPT, 1U << 20, \
- NULL, "Maximum amount of IO to keep in flight by the move path")\
- x(move_ios_in_flight, u32, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(1, 1024), \
- BCH2_NO_SB_OPT, 32, \
- NULL, "Maximum number of IOs to keep in flight by the move path")\
- x(fsck, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Run fsck on mount") \
- x(fsck_memory_usage_percent, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_UINT(20, 70), \
- BCH2_NO_SB_OPT, 50, \
- NULL, "Maximum percentage of system ram fsck is allowed to pin")\
- x(fix_errors, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_FN(bch2_opt_fix_errors), \
- BCH2_NO_SB_OPT, FSCK_FIX_exit, \
- NULL, "Fix errors during fsck without asking") \
- x(ratelimit_errors, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
- NULL, "Ratelimit error messages during fsck") \
- x(nochanges, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Super read only mode - no writes at all will be issued,\n"\
- "even if we have to replay the journal") \
- x(norecovery, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Exit recovery immediately prior to journal replay")\
- x(journal_rewind, u64, \
- OPT_FS|OPT_MOUNT, \
- OPT_UINT(0, U64_MAX), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Rewind journal") \
- x(recovery_passes, u64, \
- OPT_FS|OPT_MOUNT, \
- OPT_BITFIELD(bch2_recovery_passes), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Recovery passes to run explicitly") \
- x(recovery_passes_exclude, u64, \
- OPT_FS|OPT_MOUNT, \
- OPT_BITFIELD(bch2_recovery_passes), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Recovery passes to exclude") \
- x(recovery_pass_last, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_STR_NOLIMIT(bch2_recovery_passes), \
- BCH2_NO_SB_OPT, 0, \
- NULL, "Exit recovery after specified pass") \
- x(retain_recovery_info, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\
- x(read_entire_journal, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Read all journal entries, not just dirty ones")\
- x(read_journal_only, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Only read the journal, skip the rest of recovery")\
- x(journal_transaction_names, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
- NULL, "Log transaction function names in journal") \
- x(allocator_stuck_timeout, u16, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, U16_MAX), \
- BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \
- NULL, "Default timeout in seconds for stuck allocator messages")\
- x(noexcl, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don't open device in exclusive mode") \
- x(direct_io, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Use O_DIRECT (userspace only)") \
- x(sb, u64, \
- OPT_MOUNT, \
- OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
- "offset", "Sector offset of superblock") \
- x(read_only, u8, \
- OPT_FS|OPT_MOUNT|OPT_HIDDEN, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, NULL) \
- x(nostart, u8, \
- 0, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Don\'t start filesystem, only open devices") \
- x(reconstruct_alloc, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Reconstruct alloc btree") \
- x(version_upgrade, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_version_upgrade_opts), \
- BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
- NULL, "Set superblock to latest version,\n" \
- "allowing any new features to be used") \
- x(stdio, u64, \
- 0, \
- OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Pointer to a struct stdio_redirect") \
- x(project, u8, \
- OPT_INODE, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, NULL) \
- x(nocow, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
- OPT_BOOL(), \
- BCH_SB_NOCOW, false, \
- NULL, "Nocow mode: Writes will be done in place when possible.\n"\
- "Snapshots and reflink will still cause writes to be COW\n"\
- "Implicitly disables data checksumming, compression and encryption")\
- x(nocow_enabled, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable nocow mode: enables runtime locking in\n"\
- "data move path needed if nocow will ever be in use\n")\
- x(copygc_enabled, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable copygc: disable for debugging, or to\n"\
- "quiet the system when doing performance testing\n")\
- x(rebalance_enabled, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable rebalance: disable for debugging, or to\n"\
- "quiet the system when doing performance testing\n")\
- x(rebalance_on_ac_only, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_REBALANCE_AC_ONLY, false, \
- NULL, "Enable rebalance while on mains power only\n") \
- x(auto_snapshot_deletion, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\
- "quiet the system when doing performance testing\n")\
- x(no_data_io, u8, \
- OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Skip submit_bio() for data reads and writes, " \
- "for performance testing purposes") \
- x(state, u64, \
- OPT_DEVICE|OPT_RUNTIME, \
- OPT_STR(bch2_member_states), \
- BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \
- "state", "rw,ro,failed,spare") \
- x(bucket_size, u32, \
- OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
- OPT_UINT(0, S64_MAX), \
- BCH_MEMBER_BUCKET_SIZE, 0, \
- "size", "Specifies the bucket size; must be greater than the btree node size")\
- x(durability, u8, \
- OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \
- OPT_UINT(0, BCH_REPLICAS_MAX), \
- BCH_MEMBER_DURABILITY, 1, \
- "n", "Data written to this device will be considered\n"\
- "to have already been replicated n times") \
- x(data_allowed, u8, \
- OPT_DEVICE, \
- OPT_BITFIELD(__bch2_data_types), \
- BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
- "types", "Allowed data types for this device: journal, btree, and/or user")\
- x(discard, u8, \
- OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_MEMBER_DISCARD, true, \
- NULL, "Enable discard/TRIM support") \
- x(btree_node_prefetch, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\
- " prefetched sequentially")
-
-struct bch_opts {
-#define x(_name, _bits, ...) unsigned _name##_defined:1;
- BCH_OPTS()
-#undef x
-
-#define x(_name, _bits, ...) _bits _name;
- BCH_OPTS()
-#undef x
-};
-
-struct bch2_opts_parse {
- struct bch_opts opts;
-
- /* to save opts that can't be parsed before the FS is opened: */
- struct printbuf parse_later;
-};
-
-static const __maybe_unused struct bch_opts bch2_opts_default = {
-#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
- ._name##_defined = true, \
- ._name = _default, \
-
- BCH_OPTS()
-#undef x
-};
-
-#define opt_defined(_opts, _name) ((_opts)._name##_defined)
-
-#define opt_get(_opts, _name) \
- (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
-
-#define opt_set(_opts, _name, _v) \
-do { \
- (_opts)._name##_defined = true; \
- (_opts)._name = _v; \
-} while (0)
-
-static inline struct bch_opts bch2_opts_empty(void)
-{
- return (struct bch_opts) { 0 };
-}
-
-void bch2_opts_apply(struct bch_opts *, struct bch_opts);
-
-enum bch_opt_id {
-#define x(_name, ...) Opt_##_name,
- BCH_OPTS()
-#undef x
- bch2_opts_nr
-};
-
-struct bch_fs;
-struct printbuf;
-
-struct bch_option {
- struct attribute attr;
- enum opt_type type;
- enum opt_flags flags;
- u64 min, max;
-
- const char * const *choices;
-
- struct bch_opt_fn fn;
-
- const char *hint;
- const char *help;
-
- u64 (*get_sb)(const struct bch_sb *);
- void (*set_sb)(struct bch_sb *, u64);
-
- u64 (*get_member)(const struct bch_member *);
- void (*set_member)(struct bch_member *, u64);
-
-};
-
-extern const struct bch_option bch2_opt_table[];
-
-bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
-u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
-void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
-
-u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int);
-int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
-bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
-
-struct bch_dev;
-bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
-
-int bch2_opt_lookup(const char *);
-int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
-int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
- const char *, u64 *, struct printbuf *);
-
-#define OPT_SHOW_FULL_LIST (1 << 0)
-#define OPT_SHOW_MOUNT_STYLE (1 << 1)
-
-void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
- const struct bch_option *, u64, unsigned);
-void bch2_opts_to_text(struct printbuf *,
- struct bch_opts,
- struct bch_fs *, struct bch_sb *,
- unsigned, unsigned, unsigned);
-
-int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64);
-int bch2_opts_hooks_pre_set(struct bch_fs *);
-void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64,
- struct bch_opts *, enum bch_opt_id);
-
-int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
- struct printbuf *, const char *, const char *);
-int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
- char *, bool);
-
-/* inode opts: */
-
-struct bch_io_opts {
-#define x(_name, _bits) u##_bits _name;
- BCH_INODE_OPTS()
-#undef x
-#define x(_name, _bits) u64 _name##_from_inode:1;
- BCH_INODE_OPTS()
-#undef x
-};
-
-static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
-{
- if (!opts->background_target)
- opts->background_target = opts->foreground_target;
- if (!opts->background_compression)
- opts->background_compression = opts->compression;
- if (opts->nocow) {
- opts->compression = opts->background_compression = 0;
- opts->data_checksum = 0;
- opts->erasure_code = 0;
- }
-}
-
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
-bool bch2_opt_is_inode_opt(enum bch_opt_id);
-
-#endif /* _BCACHEFS_OPTS_H */
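The opt_defined()/opt_get()/opt_set() macros in the header above let callers build a struct bch_opts incrementally: each option carries a _defined bit, and opt_get() falls back to bch2_opts_default while that bit is clear. A small sketch (hypothetical function name):

#include "opts.h"

/* Hypothetical sketch of the bch_opts accessors: options start undefined,
 * opt_set() marks them defined, opt_get() falls back to the defaults. */
static u64 example_opts_usage(void)
{
	struct bch_opts opts = bch2_opts_empty();

	if (!opt_defined(opts, metadata_replicas))
		opt_set(opts, metadata_replicas, 2);

	return opt_get(opts, metadata_replicas);
}
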
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
deleted file mode 100644
index 3302bbc78a09..000000000000
--- a/fs/bcachefs/printbuf.c
+++ /dev/null
@@ -1,528 +0,0 @@
-// SPDX-License-Identifier: LGPL-2.1+
-/* Copyright (C) 2022 Kent Overstreet */
-
-#include <linux/bitmap.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/string_helpers.h>
-
-#include "printbuf.h"
-
-static inline unsigned __printbuf_linelen(struct printbuf *buf, unsigned pos)
-{
- return pos - buf->last_newline;
-}
-
-static inline unsigned printbuf_linelen(struct printbuf *buf)
-{
- return __printbuf_linelen(buf, buf->pos);
-}
-
-/*
- * Returns spaces from start of line, if set, or 0 if unset:
- */
-static inline unsigned cur_tabstop(struct printbuf *buf)
-{
- return buf->cur_tabstop < buf->nr_tabstops
- ? buf->_tabstops[buf->cur_tabstop]
- : 0;
-}
-
-int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
-{
- /* Reserved space for terminating nul: */
- extra += 1;
-
- if (out->pos + extra <= out->size)
- return 0;
-
- if (!out->heap_allocated) {
- out->overflow = true;
- return 0;
- }
-
- unsigned new_size = roundup_pow_of_two(out->size + extra);
-
- /* Sanity check... */
- if (new_size > PAGE_SIZE << MAX_PAGE_ORDER) {
- out->allocation_failure = true;
- out->overflow = true;
- return -ENOMEM;
- }
-
- /*
- * Note: output buffer must be freeable with kfree(), it's not required
- * that the user use printbuf_exit().
- */
- char *buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
-
- if (!buf) {
- out->allocation_failure = true;
- out->overflow = true;
- return -ENOMEM;
- }
-
- out->buf = buf;
- out->size = new_size;
- return 0;
-}
-
-static void printbuf_advance_pos(struct printbuf *out, unsigned len)
-{
- out->pos += min(len, printbuf_remaining(out));
-}
-
-static void printbuf_insert_spaces(struct printbuf *out, unsigned pos, unsigned nr)
-{
- unsigned move = out->pos - pos;
-
- bch2_printbuf_make_room(out, nr);
-
- if (pos + nr < out->size)
- memmove(out->buf + pos + nr,
- out->buf + pos,
- min(move, out->size - 1 - pos - nr));
-
- if (pos < out->size)
- memset(out->buf + pos, ' ', min(nr, out->size - pos));
-
- printbuf_advance_pos(out, nr);
- printbuf_nul_terminate_reserved(out);
-}
-
-static void __printbuf_do_indent(struct printbuf *out, unsigned pos)
-{
- while (true) {
- int pad;
- unsigned len = out->pos - pos;
- char *p = out->buf + pos;
- char *n = memscan(p, '\n', len);
- if (cur_tabstop(out)) {
- n = min(n, (char *) memscan(p, '\r', len));
- n = min(n, (char *) memscan(p, '\t', len));
- }
-
- pos = n - out->buf;
- if (pos == out->pos)
- break;
-
- switch (*n) {
- case '\n':
- pos++;
- out->last_newline = pos;
-
- printbuf_insert_spaces(out, pos, out->indent);
-
- pos = min(pos + out->indent, out->pos);
- out->last_field = pos;
- out->cur_tabstop = 0;
- break;
- case '\r':
- memmove(n, n + 1, out->pos - pos);
- --out->pos;
- pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos);
- if (pad > 0) {
- printbuf_insert_spaces(out, out->last_field, pad);
- pos += pad;
- }
-
- out->last_field = pos;
- out->cur_tabstop++;
- break;
- case '\t':
- pad = (int) cur_tabstop(out) - (int) __printbuf_linelen(out, pos) - 1;
- if (pad > 0) {
- *n = ' ';
- printbuf_insert_spaces(out, pos, pad - 1);
- pos += pad;
- } else {
- memmove(n, n + 1, out->pos - pos);
- --out->pos;
- }
-
- out->last_field = pos;
- out->cur_tabstop++;
- break;
- }
- }
-}
-
-static inline void printbuf_do_indent(struct printbuf *out, unsigned pos)
-{
- if (out->has_indent_or_tabstops && !out->suppress_indent_tabstop_handling)
- __printbuf_do_indent(out, pos);
-}
-
-void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
-{
- int len;
-
- do {
- va_list args2;
-
- va_copy(args2, args);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args2);
- va_end(args2);
- } while (len > printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len));
-
- unsigned indent_pos = out->pos;
- printbuf_advance_pos(out, len);
- printbuf_do_indent(out, indent_pos);
-}
-
-void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
-{
- va_list args;
- int len;
-
- do {
- va_start(args, fmt);
- len = vsnprintf(out->buf + out->pos, printbuf_remaining_size(out), fmt, args);
- va_end(args);
- } while (len > printbuf_remaining(out) &&
- !bch2_printbuf_make_room(out, len));
-
- unsigned indent_pos = out->pos;
- printbuf_advance_pos(out, len);
- printbuf_do_indent(out, indent_pos);
-}
-
-/**
- * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
- * null terminated
- * @buf: printbuf to terminate
- * Returns: Printbuf contents, as a nul terminated C string
- */
-const char *bch2_printbuf_str(const struct printbuf *buf)
-{
- /*
- * If we've written to a printbuf then it's guaranteed to be a null
- * terminated string - but if we haven't, then we might not have
- * allocated a buffer at all:
- */
- return buf->pos
- ? buf->buf
- : "";
-}
-
-/**
- * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
- * against accidental use.
- * @buf: printbuf to exit
- */
-void bch2_printbuf_exit(struct printbuf *buf)
-{
- if (buf->heap_allocated) {
- kfree(buf->buf);
- buf->buf = ERR_PTR(-EINTR); /* poison value */
- }
-}
-
-void bch2_printbuf_tabstops_reset(struct printbuf *buf)
-{
- buf->nr_tabstops = 0;
-}
-
-void bch2_printbuf_tabstop_pop(struct printbuf *buf)
-{
- if (buf->nr_tabstops)
- --buf->nr_tabstops;
-}
-
-/*
- * bch2_printbuf_tabstop_push() - add a tabstop, n spaces from the previous tabstop
- *
- * @buf: printbuf to control
- * @spaces: number of spaces from previous tabstop
- *
- * In the future this function may allocate memory if setting more than
- * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
- * of line.
- */
-int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
-{
- unsigned prev_tabstop = buf->nr_tabstops
- ? buf->_tabstops[buf->nr_tabstops - 1]
- : 0;
-
- if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
- return -EINVAL;
-
- buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
- buf->has_indent_or_tabstops = true;
- return 0;
-}
-
-/**
- * bch2_printbuf_indent_add() - add to the current indent level
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to add to the current indent level
- *
- * Subsequent lines, and the current line if the output position is at the start
- * of the current line, will be indented by @spaces more spaces.
- */
-void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
-{
- if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
- spaces = 0;
-
- buf->indent += spaces;
- prt_chars(buf, ' ', spaces);
-
- buf->has_indent_or_tabstops = true;
-}
-
-/**
- * bch2_printbuf_indent_add_nextline() - add to the current indent level for
- * subsequent lines
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to add to the current indent level
- *
- * Subsequent lines - not the current line - will be indented by @spaces more
- * spaces.
- */
-void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces)
-{
- if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
- spaces = 0;
-
- buf->indent += spaces;
- buf->has_indent_or_tabstops = true;
-}
-
-/**
- * bch2_printbuf_indent_sub() - subtract from the current indent level
- *
- * @buf: printbuf to control
- * @spaces: number of spaces to subtract from the current indent level
- *
- * Subsequent lines, and the current line if the output position is at the start
- * of the current line, will be indented by @spaces less spaces.
- */
-void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
-{
- if (WARN_ON_ONCE(spaces > buf->indent))
- spaces = buf->indent;
-
- if (buf->last_newline + buf->indent == buf->pos) {
- buf->pos -= spaces;
- printbuf_nul_terminate(buf);
- }
- buf->indent -= spaces;
-
- if (!buf->indent && !buf->nr_tabstops)
- buf->has_indent_or_tabstops = false;
-}
-
-void bch2_prt_newline(struct printbuf *buf)
-{
- bch2_printbuf_make_room(buf, 1 + buf->indent);
-
- __prt_char_reserved(buf, '\n');
-
- buf->last_newline = buf->pos;
-
- __prt_chars_reserved(buf, ' ', buf->indent);
-
- printbuf_nul_terminate_reserved(buf);
-
- buf->last_field = buf->pos;
- buf->cur_tabstop = 0;
-}
-
-void bch2_printbuf_strip_trailing_newline(struct printbuf *out)
-{
- for (int p = out->pos - 1; p >= 0; --p) {
- if (out->buf[p] == '\n') {
- out->pos = p;
- break;
- }
- if (out->buf[p] != ' ')
- break;
- }
-
- printbuf_nul_terminate_reserved(out);
-}
-
-static void __prt_tab(struct printbuf *out)
-{
- int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
-
- prt_chars(out, ' ', spaces);
-
- out->last_field = out->pos;
- out->cur_tabstop++;
-}
-
-/**
- * bch2_prt_tab() - Advance printbuf to the next tabstop
- * @out: printbuf to control
- *
- * Advance output to the next tabstop by printing spaces.
- */
-void bch2_prt_tab(struct printbuf *out)
-{
- if (WARN_ON(!cur_tabstop(out)))
- return;
-
- __prt_tab(out);
-}
-
-static void __prt_tab_rjust(struct printbuf *buf)
-{
- int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
- if (pad > 0)
- printbuf_insert_spaces(buf, buf->last_field, pad);
-
- buf->last_field = buf->pos;
- buf->cur_tabstop++;
-}
-
-/**
- * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
- * previous output
- *
- * @buf: printbuf to control
- *
- * Advance output to the next tabstop by inserting spaces immediately after the
- * previous tabstop, right justifying previously outputted text.
- */
-void bch2_prt_tab_rjust(struct printbuf *buf)
-{
- if (WARN_ON(!cur_tabstop(buf)))
- return;
-
- __prt_tab_rjust(buf);
-}
-
-/**
- * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
- *
- * @out: output printbuf
- * @str: string to print
- * @count: number of bytes to print
- *
- * The following control characters are handled specially:
- * \n: prt_newline newline that obeys current indent level
- * \t: prt_tab advance to next tabstop
- * \r: prt_tab_rjust advance to next tabstop, with right justification
- */
-void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
-{
- unsigned indent_pos = out->pos;
- prt_bytes(out, str, count);
- printbuf_do_indent(out, indent_pos);
-}
-
-/**
- * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
- * @out: output printbuf
- * @v: integer to print
- *
- * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
- */
-void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
-{
- bch2_printbuf_make_room(out, 10);
- unsigned len = string_get_size(v, 1, !out->si_units,
- out->buf + out->pos,
- printbuf_remaining_size(out));
- printbuf_advance_pos(out, len);
-}
-
-/**
- * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
- * @out: output printbuf
- * @v: integer to print
- *
- * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
- */
-void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
-{
- if (v < 0)
- prt_char(out, '-');
- bch2_prt_human_readable_u64(out, abs(v));
-}
-
-/**
- * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
- * @out: output printbuf
- * @v: integer to print
- *
- * Units are either raw (default), or human readable units (controlled via
- * @buf->human_readable_units)
- */
-void bch2_prt_units_u64(struct printbuf *out, u64 v)
-{
- if (out->human_readable_units)
- bch2_prt_human_readable_u64(out, v);
- else
- bch2_prt_printf(out, "%llu", v);
-}
-
-/**
- * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
- * @out: output printbuf
- * @v: integer to print
- *
- * Units are either raw (default), or human readable units (controlled via
- * @buf->human_readable_units)
- */
-void bch2_prt_units_s64(struct printbuf *out, s64 v)
-{
- if (v < 0)
- prt_char(out, '-');
- bch2_prt_units_u64(out, abs(v));
-}
-
-void bch2_prt_string_option(struct printbuf *out,
- const char * const list[],
- size_t selected)
-{
- for (size_t i = 0; list[i]; i++)
- bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
-}
-
-void bch2_prt_bitflags(struct printbuf *out,
- const char * const list[], u64 flags)
-{
- unsigned bit, nr = 0;
- bool first = true;
-
- while (list[nr])
- nr++;
-
- while (flags && (bit = __ffs64(flags)) < nr) {
- if (!first)
- bch2_prt_printf(out, ",");
- first = false;
- bch2_prt_printf(out, "%s", list[bit]);
- flags ^= BIT_ULL(bit);
- }
-}
-
-void bch2_prt_bitflags_vector(struct printbuf *out,
- const char * const list[],
- unsigned long *v, unsigned nr)
-{
- bool first = true;
- unsigned i;
-
- for (i = 0; i < nr; i++)
- if (!list[i]) {
- nr = i - 1;
- break;
- }
-
- for_each_set_bit(i, v, nr) {
- if (!first)
- bch2_prt_printf(out, ",");
- first = false;
- bch2_prt_printf(out, "%s", list[i]);
- }
-}
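The two helpers above take a NULL-terminated array of names plus a bitmask; a minimal sketch of how a caller would use bch2_prt_bitflags (the flag names and function below are illustrative, not from the deleted file):

static const char * const example_flag_names[] = {
	"dirty", "error", "read_only", NULL,
};

static void example_flags_to_text(struct printbuf *out, u64 flags)
{
	/* For flags == 0b101 this prints "dirty,read_only" */
	bch2_prt_bitflags(out, example_flag_names, flags);
}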
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
deleted file mode 100644
index 8f4e28d440ac..000000000000
--- a/fs/bcachefs/printbuf.h
+++ /dev/null
@@ -1,298 +0,0 @@
-/* SPDX-License-Identifier: LGPL-2.1+ */
-/* Copyright (C) 2022 Kent Overstreet */
-
-#ifndef _BCACHEFS_PRINTBUF_H
-#define _BCACHEFS_PRINTBUF_H
-
-/*
- * Printbufs: Simple strings for printing to, with optional heap allocation
- *
- * This code has provisions for use in userspace, to aid in making other code
- * portable between kernelspace and userspace.
- *
- * Basic example:
- * struct printbuf buf = PRINTBUF;
- *
- * prt_printf(&buf, "foo=");
- * foo_to_text(&buf, foo);
- * printk("%s", buf.buf);
- * printbuf_exit(&buf);
- *
- * Or
- * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
- *
- * We can now write pretty printers instead of writing code that dumps
- * everything to the kernel log buffer, and then those pretty-printers can be
- * used by other code that outputs to kernel log, sysfs, debugfs, etc.
- *
- * Memory allocation: Outputting to a printbuf may allocate memory. This
- * allocation is done with GFP_KERNEL, by default: use the newer
- * memalloc_*_(save|restore) functions as needed.
- *
- * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
- * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
- *
- * It's allowed to grab the output buffer and free it later with kfree() instead
- * of using printbuf_exit(), if the user just needs a heap allocated string at
- * the end.
- *
- * Memory allocation failures: We don't return errors directly, because on
- * memory allocation failure we usually don't want to bail out and unwind - we
- * want to print what we've got, on a best-effort basis. But code that does want
- * to return -ENOMEM may check printbuf.allocation_failure.
- *
- * Indenting, tabstops:
- *
- * To aid in writing multi-line pretty printers spread across multiple
- * functions, printbufs track the current indent level.
- *
- * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the current indent
- * level, respectively.
- *
- * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from
- * start of line. Once set, prt_tab() will output spaces up to the next tabstop.
- * prt_tab_rjust() will also advance the current line of text up to the next
- * tabstop, but it does so by shifting text since the previous tabstop up to the
- * next tabstop - right justifying it.
- *
- * Make sure you use prt_newline() instead of \n in the format string for indent
- * level and tabstops to work correctly.
- *
- * Output units: printbuf->units exists to tell pretty-printers how to output
- * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as
- * human readable bytes. prt_units() obeys it.
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-enum printbuf_si {
- PRINTBUF_UNITS_2, /* use binary powers of 2^10 */
- PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */
-};
-
-#define PRINTBUF_INLINE_TABSTOPS 6
-
-struct printbuf {
- char *buf;
- unsigned size;
- unsigned pos;
- unsigned last_newline;
- unsigned last_field;
- unsigned indent;
- /*
- * If nonzero, allocations will be done with GFP_NOWAIT:
- */
- u8 atomic;
- bool allocation_failure:1;
- bool heap_allocated:1;
- bool overflow:1;
- enum printbuf_si si_units:1;
- bool human_readable_units:1;
- bool has_indent_or_tabstops:1;
- bool suppress_indent_tabstop_handling:1;
- u8 nr_tabstops;
-
- /*
- * Do not modify directly: use printbuf_tabstop_push(),
- * printbuf_tabstop_pop()
- */
- u8 cur_tabstop;
- u8 _tabstops[PRINTBUF_INLINE_TABSTOPS];
-};
-
-int bch2_printbuf_make_room(struct printbuf *, unsigned);
-__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...);
-__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list);
-const char *bch2_printbuf_str(const struct printbuf *);
-void bch2_printbuf_exit(struct printbuf *);
-
-void bch2_printbuf_tabstops_reset(struct printbuf *);
-void bch2_printbuf_tabstop_pop(struct printbuf *);
-int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
-
-void bch2_printbuf_indent_add(struct printbuf *, unsigned);
-void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned);
-void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
-
-void bch2_prt_newline(struct printbuf *);
-void bch2_printbuf_strip_trailing_newline(struct printbuf *);
-void bch2_prt_tab(struct printbuf *);
-void bch2_prt_tab_rjust(struct printbuf *);
-
-void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned);
-void bch2_prt_human_readable_u64(struct printbuf *, u64);
-void bch2_prt_human_readable_s64(struct printbuf *, s64);
-void bch2_prt_units_u64(struct printbuf *, u64);
-void bch2_prt_units_s64(struct printbuf *, s64);
-void bch2_prt_string_option(struct printbuf *, const char * const[], size_t);
-void bch2_prt_bitflags(struct printbuf *, const char * const[], u64);
-void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
- unsigned long *, unsigned);
-
-/* Initializer for a heap allocated printbuf: */
-#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
-
-/* Initializer for a printbuf that points to an external buffer: */
-#define PRINTBUF_EXTERN(_buf, _size) \
-((struct printbuf) { \
- .buf = _buf, \
- .size = _size, \
-})
-
-static inline struct printbuf bch2_printbuf_init(void)
-{
- return PRINTBUF;
-}
-
-DEFINE_CLASS(printbuf, struct printbuf,
- bch2_printbuf_exit(&_T), bch2_printbuf_init(), void)
-
-/*
- * Returns size remaining of output buffer:
- */
-static inline unsigned printbuf_remaining_size(struct printbuf *out)
-{
- if (WARN_ON(out->size && out->pos >= out->size))
- out->pos = out->size - 1;
- return out->size - out->pos;
-}
-
-/*
- * Returns number of characters we can print to the output buffer - i.e.
- * excluding the terminating nul:
- */
-static inline unsigned printbuf_remaining(struct printbuf *out)
-{
- return out->size ? printbuf_remaining_size(out) - 1 : 0;
-}
-
-static inline unsigned printbuf_written(struct printbuf *out)
-{
- return out->size ? min(out->pos, out->size - 1) : 0;
-}
-
-static inline void printbuf_nul_terminate_reserved(struct printbuf *out)
-{
- if (WARN_ON(out->size && out->pos >= out->size))
- out->pos = out->size - 1;
- if (out->size)
- out->buf[out->pos] = 0;
-}
-
-static inline void printbuf_nul_terminate(struct printbuf *out)
-{
- bch2_printbuf_make_room(out, 1);
- printbuf_nul_terminate_reserved(out);
-}
-
-/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
-static inline void __prt_char_reserved(struct printbuf *out, char c)
-{
- if (printbuf_remaining(out))
- out->buf[out->pos++] = c;
-}
-
-/* Doesn't nul terminate: */
-static inline void __prt_char(struct printbuf *out, char c)
-{
- bch2_printbuf_make_room(out, 1);
- __prt_char_reserved(out, c);
-}
-
-static inline void prt_char(struct printbuf *out, char c)
-{
- bch2_printbuf_make_room(out, 2);
- __prt_char_reserved(out, c);
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
-{
- unsigned can_print = min(n, printbuf_remaining(out));
-
- for (unsigned i = 0; i < can_print; i++)
- out->buf[out->pos++] = c;
-}
-
-static inline void prt_chars(struct printbuf *out, char c, unsigned n)
-{
- bch2_printbuf_make_room(out, n);
- __prt_chars_reserved(out, c, n);
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
-{
- bch2_printbuf_make_room(out, n);
-
- unsigned can_print = min(n, printbuf_remaining(out));
-
- for (unsigned i = 0; i < can_print; i++)
- out->buf[out->pos++] = ((char *) b)[i];
-
- printbuf_nul_terminate(out);
-}
-
-static inline void prt_str(struct printbuf *out, const char *str)
-{
- prt_bytes(out, str, strlen(str));
-}
-
-static inline void prt_str_indented(struct printbuf *out, const char *str)
-{
- bch2_prt_bytes_indented(out, str, strlen(str));
-}
-
-static inline void prt_hex_byte(struct printbuf *out, u8 byte)
-{
- bch2_printbuf_make_room(out, 3);
- __prt_char_reserved(out, hex_asc_hi(byte));
- __prt_char_reserved(out, hex_asc_lo(byte));
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
-{
- bch2_printbuf_make_room(out, 3);
- __prt_char_reserved(out, hex_asc_upper_hi(byte));
- __prt_char_reserved(out, hex_asc_upper_lo(byte));
- printbuf_nul_terminate_reserved(out);
-}
-
-static inline void printbuf_reset_keep_tabstops(struct printbuf *buf)
-{
- buf->pos = 0;
- buf->allocation_failure = 0;
- buf->last_newline = 0;
- buf->last_field = 0;
- buf->indent = 0;
- buf->cur_tabstop = 0;
-}
-
-/**
- * printbuf_reset - re-use a printbuf without freeing and re-initializing it:
- */
-static inline void printbuf_reset(struct printbuf *buf)
-{
- printbuf_reset_keep_tabstops(buf);
- buf->nr_tabstops = 0;
-}
-
-/**
- * printbuf_atomic_inc - mark as entering an atomic section
- */
-static inline void printbuf_atomic_inc(struct printbuf *buf)
-{
- buf->atomic++;
-}
-
-/**
- * printbuf_atomic_dec - mark as leaving an atomic section
- */
-static inline void printbuf_atomic_dec(struct printbuf *buf)
-{
- buf->atomic--;
-}
-
-#endif /* _BCACHEFS_PRINTBUF_H */
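Pulling the declarations above together, a minimal usage sketch, assuming only what printbuf.h declares; the function name and values are illustrative:

static void example_usage_to_text(void)
{
	struct printbuf buf = PRINTBUF;			/* heap allocated printbuf */

	bch2_printbuf_tabstop_push(&buf, 24);		/* one tabstop, 24 spaces from start of line */

	prt_str(&buf, "capacity:");
	bch2_prt_tab(&buf);				/* pad with spaces up to the tabstop */
	bch2_prt_units_u64(&buf, 1ULL << 30);		/* raw, or human readable if buf.human_readable_units */
	bch2_prt_newline(&buf);

	pr_info("%s", bch2_printbuf_str(&buf));		/* always nul terminated */
	bch2_printbuf_exit(&buf);			/* frees the heap allocation */
}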
diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c
deleted file mode 100644
index d09898566abe..000000000000
--- a/fs/bcachefs/progress.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "disk_accounting.h"
-#include "progress.h"
-
-void bch2_progress_init(struct progress_indicator_state *s,
- struct bch_fs *c,
- u64 btree_id_mask)
-{
- memset(s, 0, sizeof(*s));
-
- s->next_print = jiffies + HZ * 10;
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++) {
- if (!(btree_id_mask & BIT_ULL(i)))
- continue;
-
- struct disk_accounting_pos acc;
- disk_accounting_key_init(acc, btree, .id = i);
-
- u64 v;
- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
- s->nodes_total += div64_ul(v, btree_sectors(c));
- }
-}
-
-static inline bool progress_update_p(struct progress_indicator_state *s)
-{
- bool ret = time_after_eq(jiffies, s->next_print);
-
- if (ret)
- s->next_print = jiffies + HZ * 10;
- return ret;
-}
-
-void bch2_progress_update_iter(struct btree_trans *trans,
- struct progress_indicator_state *s,
- struct btree_iter *iter,
- const char *msg)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = path_l(btree_iter_path(trans, iter))->b;
-
- s->nodes_seen += b != s->last_node;
- s->last_node = b;
-
- if (progress_update_p(s)) {
- struct printbuf buf = PRINTBUF;
- unsigned percent = s->nodes_total
- ? div64_u64(s->nodes_seen * 100, s->nodes_total)
- : 0;
-
- prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
- msg, percent, s->nodes_seen, s->nodes_total);
- bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
-
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-}
diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h
deleted file mode 100644
index 23fb1811f943..000000000000
--- a/fs/bcachefs/progress.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_PROGRESS_H
-#define _BCACHEFS_PROGRESS_H
-
-/*
- * Lame progress indicators
- *
- * We don't like to use these because they print to the dmesg console, which is
- * spammy - we much prefer to be wired up to a userspace program (e.g. via
- * thread_with_file) and have it print the progress indicator.
- *
- * But some code is old and doesn't support that, or runs in a context where
- * that's not yet practical (mount).
- */
-
-struct progress_indicator_state {
- unsigned long next_print;
- u64 nodes_seen;
- u64 nodes_total;
- struct btree *last_node;
-};
-
-void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
-void bch2_progress_update_iter(struct btree_trans *,
- struct progress_indicator_state *,
- struct btree_iter *,
- const char *);
-
-#endif /* _BCACHEFS_PROGRESS_H */
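A sketch of the intended usage, under the assumption that the btree iteration helpers from the rest of the tree (bch2_trans_run, for_each_btree_key) are available; the function shown is hypothetical:

static int example_scan_extents(struct bch_fs *c)
{
	struct progress_indicator_state progress;

	/* Totals nodes for the selected btrees so a percentage can be printed */
	bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents));

	return bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents, POS_MIN,
				   BTREE_ITER_prefetch, k, ({
			/* Logs "example_scan_extents: NN%, done X/Y nodes, at ..." at most every 10s */
			bch2_progress_update_iter(trans, &progress, &iter, "example_scan_extents");
			0;
		})));
}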
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
deleted file mode 100644
index f241efb1fb50..000000000000
--- a/fs/bcachefs/quota.c
+++ /dev/null
@@ -1,892 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "quota.h"
-#include "snapshot.h"
-#include "super-io.h"
-
-static const char * const bch2_quota_types[] = {
- "user",
- "group",
- "project",
-};
-
-static const char * const bch2_quota_counters[] = {
- "space",
- "inodes",
-};
-
-static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_quota *q = field_to_type(f, quota);
-
- if (vstruct_bytes(&q->field) < sizeof(*q)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&q->field), sizeof(*q));
- return -BCH_ERR_invalid_sb_quota;
- }
-
- return 0;
-}
-
-static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_quota *q = field_to_type(f, quota);
- unsigned qtyp, counter;
-
- for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
- prt_printf(out, "%s: flags %llx",
- bch2_quota_types[qtyp],
- le64_to_cpu(q->q[qtyp].flags));
-
- for (counter = 0; counter < Q_COUNTERS; counter++)
- prt_printf(out, " %s timelimit %u warnlimit %u",
- bch2_quota_counters[counter],
- le32_to_cpu(q->q[qtyp].c[counter].timelimit),
- le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
-
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_quota = {
- .validate = bch2_sb_quota_validate,
- .to_text = bch2_sb_quota_to_text,
-};
-
-int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(k.k->p.inode >= QTYP_NR,
- c, quota_type_invalid,
- "invalid quota type (%llu >= %u)",
- k.k->p.inode, QTYP_NR);
-fsck_err:
- return ret;
-}
-
-void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_quota dq = bkey_s_c_to_quota(k);
- unsigned i;
-
- for (i = 0; i < Q_COUNTERS; i++)
- prt_printf(out, "%s hardlimit %llu softlimit %llu",
- bch2_quota_counters[i],
- le64_to_cpu(dq.v->c[i].hardlimit),
- le64_to_cpu(dq.v->c[i].softlimit));
-}
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-#include <linux/cred.h>
-#include <linux/fs.h>
-#include <linux/quota.h>
-
-static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
-{
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 20);
-
- prt_printf(out, "i_fieldmask\t%x\n", i->i_fieldmask);
- prt_printf(out, "i_flags\t%u\n", i->i_flags);
- prt_printf(out, "i_spc_timelimit\t%u\n", i->i_spc_timelimit);
- prt_printf(out, "i_ino_timelimit\t%u\n", i->i_ino_timelimit);
- prt_printf(out, "i_rt_spc_timelimit\t%u\n", i->i_rt_spc_timelimit);
- prt_printf(out, "i_spc_warnlimit\t%u\n", i->i_spc_warnlimit);
- prt_printf(out, "i_ino_warnlimit\t%u\n", i->i_ino_warnlimit);
- prt_printf(out, "i_rt_spc_warnlimit\t%u\n", i->i_rt_spc_warnlimit);
-}
-
-static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
-{
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 20);
-
- prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask);
- prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit);
- prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit);
- prt_printf(out, "d_ino_hardlimit\%llu\n", q->d_ino_hardlimit);
- prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit);
- prt_printf(out, "d_space\t%llu\n", q->d_space);
- prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count);
- prt_printf(out, "d_ino_timer\t%llu\n", q->d_ino_timer);
- prt_printf(out, "d_spc_timer\t%llu\n", q->d_spc_timer);
- prt_printf(out, "d_ino_warns\t%i\n", q->d_ino_warns);
- prt_printf(out, "d_spc_warns\t%i\n", q->d_spc_warns);
-}
-
-static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
-{
- qtypes >>= i;
- return qtypes ? i + __ffs(qtypes) : QTYP_NR;
-}
-
-#define for_each_set_qtype(_c, _i, _q, _qtypes) \
- for (_i = 0; \
- (_i = __next_qtype(_i, _qtypes), \
- _q = &(_c)->quotas[_i], \
- _i < QTYP_NR); \
- _i++)
-
-static bool ignore_hardlimit(struct bch_memquota_type *q)
-{
- if (capable(CAP_SYS_RESOURCE))
- return true;
-#if 0
- struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
-
- return capable(CAP_SYS_RESOURCE) &&
- (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
- !(info->dqi_flags & DQF_ROOT_SQUASH));
-#endif
- return false;
-}
-
-enum quota_msg {
- SOFTWARN, /* Softlimit reached */
- SOFTLONGWARN, /* Grace time expired */
- HARDWARN, /* Hardlimit reached */
-
- HARDBELOW, /* Usage got below inode hardlimit */
- SOFTBELOW, /* Usage got below inode softlimit */
-};
-
-static int quota_nl[][Q_COUNTERS] = {
- [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN,
- [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN,
- [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN,
- [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW,
- [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW,
-
- [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN,
- [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN,
- [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN,
- [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW,
- [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW,
-};
-
-struct quota_msgs {
- u8 nr;
- struct {
- u8 qtype;
- u8 msg;
- } m[QTYP_NR * Q_COUNTERS];
-};
-
-static void prepare_msg(unsigned qtype,
- enum quota_counters counter,
- struct quota_msgs *msgs,
- enum quota_msg msg_type)
-{
- BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
-
- msgs->m[msgs->nr].qtype = qtype;
- msgs->m[msgs->nr].msg = quota_nl[msg_type][counter];
- msgs->nr++;
-}
-
-static void prepare_warning(struct memquota_counter *qc,
- unsigned qtype,
- enum quota_counters counter,
- struct quota_msgs *msgs,
- enum quota_msg msg_type)
-{
- if (qc->warning_issued & (1 << msg_type))
- return;
-
- prepare_msg(qtype, counter, msgs, msg_type);
-}
-
-static void flush_warnings(struct bch_qid qid,
- struct super_block *sb,
- struct quota_msgs *msgs)
-{
- unsigned i;
-
- for (i = 0; i < msgs->nr; i++)
- quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
- sb->s_dev, msgs->m[i].msg);
-}
-
-static int bch2_quota_check_limit(struct bch_fs *c,
- unsigned qtype,
- struct bch_memquota *mq,
- struct quota_msgs *msgs,
- enum quota_counters counter,
- s64 v,
- enum quota_acct_mode mode)
-{
- struct bch_memquota_type *q = &c->quotas[qtype];
- struct memquota_counter *qc = &mq->c[counter];
- u64 n = qc->v + v;
-
- BUG_ON((s64) n < 0);
-
- if (mode == KEY_TYPE_QUOTA_NOCHECK)
- return 0;
-
- if (v <= 0) {
- if (n < qc->hardlimit &&
- (qc->warning_issued & (1 << HARDWARN))) {
- qc->warning_issued &= ~(1 << HARDWARN);
- prepare_msg(qtype, counter, msgs, HARDBELOW);
- }
-
- if (n < qc->softlimit &&
- (qc->warning_issued & (1 << SOFTWARN))) {
- qc->warning_issued &= ~(1 << SOFTWARN);
- prepare_msg(qtype, counter, msgs, SOFTBELOW);
- }
-
- qc->warning_issued = 0;
- return 0;
- }
-
- if (qc->hardlimit &&
- qc->hardlimit < n &&
- !ignore_hardlimit(q)) {
- prepare_warning(qc, qtype, counter, msgs, HARDWARN);
- return -EDQUOT;
- }
-
- if (qc->softlimit &&
- qc->softlimit < n) {
- if (qc->timer == 0) {
- qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
- prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
- } else if (ktime_get_real_seconds() >= qc->timer &&
- !ignore_hardlimit(q)) {
- prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
- return -EDQUOT;
- }
- }
-
- return 0;
-}
-
-int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
- enum quota_counters counter, s64 v,
- enum quota_acct_mode mode)
-{
- unsigned qtypes = enabled_qtypes(c);
- struct bch_memquota_type *q;
- struct bch_memquota *mq[QTYP_NR];
- struct quota_msgs msgs;
- unsigned i;
- int ret = 0;
-
- memset(&msgs, 0, sizeof(msgs));
-
- for_each_set_qtype(c, i, q, qtypes) {
- mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
- if (!mq[i])
- return -ENOMEM;
- }
-
- for_each_set_qtype(c, i, q, qtypes)
- mutex_lock_nested(&q->lock, i);
-
- for_each_set_qtype(c, i, q, qtypes) {
- ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
- if (ret)
- goto err;
- }
-
- for_each_set_qtype(c, i, q, qtypes)
- mq[i]->c[counter].v += v;
-err:
- for_each_set_qtype(c, i, q, qtypes)
- mutex_unlock(&q->lock);
-
- flush_warnings(qid, c->vfs_sb, &msgs);
-
- return ret;
-}
-
-static void __bch2_quota_transfer(struct bch_memquota *src_q,
- struct bch_memquota *dst_q,
- enum quota_counters counter, s64 v)
-{
- BUG_ON(v > src_q->c[counter].v);
- BUG_ON(v + dst_q->c[counter].v < v);
-
- src_q->c[counter].v -= v;
- dst_q->c[counter].v += v;
-}
-
-int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
- struct bch_qid dst,
- struct bch_qid src, u64 space,
- enum quota_acct_mode mode)
-{
- struct bch_memquota_type *q;
- struct bch_memquota *src_q[3], *dst_q[3];
- struct quota_msgs msgs;
- unsigned i;
- int ret = 0;
-
- qtypes &= enabled_qtypes(c);
-
- memset(&msgs, 0, sizeof(msgs));
-
- for_each_set_qtype(c, i, q, qtypes) {
- src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL);
- dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL);
- if (!src_q[i] || !dst_q[i])
- return -ENOMEM;
- }
-
- for_each_set_qtype(c, i, q, qtypes)
- mutex_lock_nested(&q->lock, i);
-
- for_each_set_qtype(c, i, q, qtypes) {
- ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
- dst_q[i]->c[Q_SPC].v + space,
- mode);
- if (ret)
- goto err;
-
- ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
- dst_q[i]->c[Q_INO].v + 1,
- mode);
- if (ret)
- goto err;
- }
-
- for_each_set_qtype(c, i, q, qtypes) {
- __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
- __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
- }
-
-err:
- for_each_set_qtype(c, i, q, qtypes)
- mutex_unlock(&q->lock);
-
- flush_warnings(dst, c->vfs_sb, &msgs);
-
- return ret;
-}
-
-static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
- struct qc_dqblk *qdq)
-{
- struct bkey_s_c_quota dq;
- struct bch_memquota_type *q;
- struct bch_memquota *mq;
- unsigned i;
-
- BUG_ON(k.k->p.inode >= QTYP_NR);
-
- if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
- return 0;
-
- switch (k.k->type) {
- case KEY_TYPE_quota:
- dq = bkey_s_c_to_quota(k);
- q = &c->quotas[k.k->p.inode];
-
- mutex_lock(&q->lock);
- mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
- if (!mq) {
- mutex_unlock(&q->lock);
- return -ENOMEM;
- }
-
- for (i = 0; i < Q_COUNTERS; i++) {
- mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
- mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
- }
-
- if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
- mq->c[Q_SPC].timer = qdq->d_spc_timer;
- if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
- mq->c[Q_SPC].warns = qdq->d_spc_warns;
- if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
- mq->c[Q_INO].timer = qdq->d_ino_timer;
- if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
- mq->c[Q_INO].warns = qdq->d_ino_warns;
-
- mutex_unlock(&q->lock);
- }
-
- return 0;
-}
-
-void bch2_fs_quota_exit(struct bch_fs *c)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
- genradix_free(&c->quotas[i].table);
-}
-
-void bch2_fs_quota_init(struct bch_fs *c)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
- mutex_init(&c->quotas[i].lock);
-}
-
-static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
-{
- struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);
-
- if (sb_quota)
- return sb_quota;
-
- sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
- if (sb_quota) {
- unsigned qtype, qc;
-
- for (qtype = 0; qtype < QTYP_NR; qtype++)
- for (qc = 0; qc < Q_COUNTERS; qc++)
- sb_quota->q[qtype].c[qc].timelimit =
- cpu_to_le32(7 * 24 * 60 * 60);
- }
-
- return sb_quota;
-}
-
-static void bch2_sb_quota_read(struct bch_fs *c)
-{
- struct bch_sb_field_quota *sb_quota;
- unsigned i, j;
-
- sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
- if (!sb_quota)
- return;
-
- for (i = 0; i < QTYP_NR; i++) {
- struct bch_memquota_type *q = &c->quotas[i];
-
- for (j = 0; j < Q_COUNTERS; j++) {
- q->limits[j].timelimit =
- le32_to_cpu(sb_quota->q[i].c[j].timelimit);
- q->limits[j].warnlimit =
- le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
- }
- }
-}
-
-static int bch2_fs_quota_read_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bch_inode_unpacked u;
- struct bch_snapshot_tree s_t;
- u32 tree = bch2_snapshot_tree(c, k.k->p.snapshot);
-
- int ret = bch2_snapshot_tree_lookup(trans, tree, &s_t);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "%s: snapshot tree %u not found", __func__, tree);
- if (ret)
- return ret;
-
- if (!s_t.master_subvol)
- goto advance;
-
- ret = bch2_inode_find_by_inum_nowarn_trans(trans,
- (subvol_inum) {
- le32_to_cpu(s_t.master_subvol),
- k.k->p.offset,
- }, &u);
- /*
- * Inode might be deleted in this snapshot - the easiest way to handle
- * that is to just skip it here:
- */
- if (bch2_err_matches(ret, ENOENT))
- goto advance;
-
- if (ret)
- return ret;
-
- bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
- KEY_TYPE_QUOTA_NOCHECK);
- bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
- KEY_TYPE_QUOTA_NOCHECK);
-advance:
- bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
- return 0;
-}
-
-int bch2_fs_quota_read(struct bch_fs *c)
-{
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
- if (!sb_quota) {
- mutex_unlock(&c->sb_lock);
- return bch_err_throw(c, ENOSPC_sb_quota);
- }
-
- bch2_sb_quota_read(c);
- mutex_unlock(&c->sb_lock);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
- BTREE_ITER_prefetch, k,
- __bch2_quota_set(c, k, NULL)) ?:
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- bch2_fs_quota_read_inode(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* Enable/disable/delete quotas for an entire filesystem: */
-
-static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_sb_field_quota *sb_quota;
- int ret = 0;
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- /* Accounting must be enabled at mount time: */
- if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
- return -EINVAL;
-
- /* Can't enable enforcement without accounting: */
- if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
- return -EINVAL;
-
- if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
- return -EINVAL;
-
- if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
- return -EINVAL;
-
- mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
- if (!sb_quota) {
- ret = bch_err_throw(c, ENOSPC_sb_quota);
- goto unlock;
- }
-
- if (uflags & FS_QUOTA_UDQ_ENFD)
- SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
-
- if (uflags & FS_QUOTA_GDQ_ENFD)
- SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
-
- if (uflags & FS_QUOTA_PDQ_ENFD)
- SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
-
- bch2_write_super(c);
-unlock:
- mutex_unlock(&c->sb_lock);
-
- return bch2_err_class(ret);
-}
-
-static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- mutex_lock(&c->sb_lock);
- if (uflags & FS_QUOTA_UDQ_ENFD)
- SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
-
- if (uflags & FS_QUOTA_GDQ_ENFD)
- SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
-
- if (uflags & FS_QUOTA_PDQ_ENFD)
- SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret;
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- if (uflags & FS_USER_QUOTA) {
- if (c->opts.usrquota)
- return -EINVAL;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
- POS(QTYP_USR, 0),
- POS(QTYP_USR, U64_MAX),
- 0, NULL);
- if (ret)
- return ret;
- }
-
- if (uflags & FS_GROUP_QUOTA) {
- if (c->opts.grpquota)
- return -EINVAL;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
- POS(QTYP_GRP, 0),
- POS(QTYP_GRP, U64_MAX),
- 0, NULL);
- if (ret)
- return ret;
- }
-
- if (uflags & FS_PROJ_QUOTA) {
- if (c->opts.prjquota)
- return -EINVAL;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
- POS(QTYP_PRJ, 0),
- POS(QTYP_PRJ, U64_MAX),
- 0, NULL);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * Return quota status information, such as enforcements, quota file inode
- * numbers etc.
- */
-static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
-{
- struct bch_fs *c = sb->s_fs_info;
- unsigned qtypes = enabled_qtypes(c);
- unsigned i;
-
- memset(state, 0, sizeof(*state));
-
- for (i = 0; i < QTYP_NR; i++) {
- state->s_state[i].flags |= QCI_SYSFILE;
-
- if (!(qtypes & (1 << i)))
- continue;
-
- state->s_state[i].flags |= QCI_ACCT_ENABLED;
-
- state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
- state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
-
- state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
- state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
- }
-
- return 0;
-}
-
-/*
- * Adjust quota timers & warnings
- */
-static int bch2_quota_set_info(struct super_block *sb, int type,
- struct qc_info *info)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_sb_field_quota *sb_quota;
- int ret = 0;
-
- if (0) {
- struct printbuf buf = PRINTBUF;
-
- qc_info_to_text(&buf, info);
- pr_info("setting:\n%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- if (type >= QTYP_NR)
- return -EINVAL;
-
- if (!((1 << type) & enabled_qtypes(c)))
- return -ESRCH;
-
- if (info->i_fieldmask &
- ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
- return -EINVAL;
-
- mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
- if (!sb_quota) {
- ret = bch_err_throw(c, ENOSPC_sb_quota);
- goto unlock;
- }
-
- if (info->i_fieldmask & QC_SPC_TIMER)
- sb_quota->q[type].c[Q_SPC].timelimit =
- cpu_to_le32(info->i_spc_timelimit);
-
- if (info->i_fieldmask & QC_SPC_WARNS)
- sb_quota->q[type].c[Q_SPC].warnlimit =
- cpu_to_le32(info->i_spc_warnlimit);
-
- if (info->i_fieldmask & QC_INO_TIMER)
- sb_quota->q[type].c[Q_INO].timelimit =
- cpu_to_le32(info->i_ino_timelimit);
-
- if (info->i_fieldmask & QC_INO_WARNS)
- sb_quota->q[type].c[Q_INO].warnlimit =
- cpu_to_le32(info->i_ino_warnlimit);
-
- bch2_sb_quota_read(c);
-
- bch2_write_super(c);
-unlock:
- mutex_unlock(&c->sb_lock);
-
- return bch2_err_class(ret);
-}
-
-/* Get/set individual quotas: */
-
-static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
-{
- dst->d_space = src->c[Q_SPC].v << 9;
- dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9;
- dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9;
- dst->d_spc_timer = src->c[Q_SPC].timer;
- dst->d_spc_warns = src->c[Q_SPC].warns;
-
- dst->d_ino_count = src->c[Q_INO].v;
- dst->d_ino_hardlimit = src->c[Q_INO].hardlimit;
- dst->d_ino_softlimit = src->c[Q_INO].softlimit;
- dst->d_ino_timer = src->c[Q_INO].timer;
- dst->d_ino_warns = src->c[Q_INO].warns;
-}
-
-static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
- struct qc_dqblk *qdq)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_memquota_type *q = &c->quotas[kqid.type];
- qid_t qid = from_kqid(&init_user_ns, kqid);
- struct bch_memquota *mq;
-
- memset(qdq, 0, sizeof(*qdq));
-
- mutex_lock(&q->lock);
- mq = genradix_ptr(&q->table, qid);
- if (mq)
- __bch2_quota_get(qdq, mq);
- mutex_unlock(&q->lock);
-
- return 0;
-}
-
-static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
- struct qc_dqblk *qdq)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_memquota_type *q = &c->quotas[kqid->type];
- qid_t qid = from_kqid(&init_user_ns, *kqid);
- struct genradix_iter iter;
- struct bch_memquota *mq;
- int ret = 0;
-
- mutex_lock(&q->lock);
-
- genradix_for_each_from(&q->table, iter, mq, qid)
- if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
- __bch2_quota_get(qdq, mq);
- *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
- goto found;
- }
-
- ret = -ENOENT;
-found:
- mutex_unlock(&q->lock);
- return bch2_err_class(ret);
-}
-
-static int bch2_set_quota_trans(struct btree_trans *trans,
- struct bkey_i_quota *new_quota,
- struct qc_dqblk *qdq)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
- BTREE_ITER_slots|BTREE_ITER_intent);
- ret = bkey_err(k);
- if (unlikely(ret))
- return ret;
-
- if (k.k->type == KEY_TYPE_quota)
- new_quota->v = *bkey_s_c_to_quota(k).v;
-
- if (qdq->d_fieldmask & QC_SPC_SOFT)
- new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
- if (qdq->d_fieldmask & QC_SPC_HARD)
- new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
-
- if (qdq->d_fieldmask & QC_INO_SOFT)
- new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
- if (qdq->d_fieldmask & QC_INO_HARD)
- new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
-
- ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_set_quota(struct super_block *sb, struct kqid qid,
- struct qc_dqblk *qdq)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bkey_i_quota new_quota;
- int ret;
-
- if (0) {
- struct printbuf buf = PRINTBUF;
-
- qc_dqblk_to_text(&buf, qdq);
- pr_info("setting:\n%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- bkey_quota_init(&new_quota.k_i);
- new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
- __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
-
- return bch2_err_class(ret);
-}
-
-const struct quotactl_ops bch2_quotactl_operations = {
- .quota_enable = bch2_quota_enable,
- .quota_disable = bch2_quota_disable,
- .rm_xquota = bch2_quota_remove,
-
- .get_state = bch2_quota_get_state,
- .set_info = bch2_quota_set_info,
-
- .get_dqblk = bch2_get_quota,
- .get_nextdqblk = bch2_get_next_quota,
- .set_dqblk = bch2_set_quota,
-};
-
-#endif /* CONFIG_BCACHEFS_QUOTA */
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
deleted file mode 100644
index 1551800ff44c..000000000000
--- a/fs/bcachefs/quota.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_H
-#define _BCACHEFS_QUOTA_H
-
-#include "inode.h"
-#include "quota_types.h"
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-
-int bch2_quota_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_quota ((struct bkey_ops) { \
- .key_validate = bch2_quota_validate, \
- .val_to_text = bch2_quota_to_text, \
- .min_val_size = 32, \
-})
-
-static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
-{
- return (struct bch_qid) {
- .q[QTYP_USR] = u->bi_uid,
- .q[QTYP_GRP] = u->bi_gid,
- .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0,
- };
-}
-
-static inline unsigned enabled_qtypes(struct bch_fs *c)
-{
- return ((c->opts.usrquota << QTYP_USR)|
- (c->opts.grpquota << QTYP_GRP)|
- (c->opts.prjquota << QTYP_PRJ));
-}
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
- s64, enum quota_acct_mode);
-
-int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
- struct bch_qid, u64, enum quota_acct_mode);
-
-void bch2_fs_quota_exit(struct bch_fs *);
-void bch2_fs_quota_init(struct bch_fs *);
-int bch2_fs_quota_read(struct bch_fs *);
-
-extern const struct quotactl_ops bch2_quotactl_operations;
-
-#else
-
-static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
- enum quota_counters counter, s64 v,
- enum quota_acct_mode mode)
-{
- return 0;
-}
-
-static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
- struct bch_qid dst,
- struct bch_qid src, u64 space,
- enum quota_acct_mode mode)
-{
- return 0;
-}
-
-static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
-static inline void bch2_fs_quota_init(struct bch_fs *c) {}
-static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
-
-#endif
-
-#endif /* _BCACHEFS_QUOTA_H */
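A short sketch, assuming only the interface declared above, of charging an allocation against the owning inode's quotas; the function and variable names are illustrative:

static int example_quota_reserve(struct bch_fs *c,
				 struct bch_inode_unpacked *inode,
				 s64 sectors)
{
	/*
	 * bch_qid() maps the inode's uid/gid/project to the three quota
	 * types; in KEY_TYPE_QUOTA_PREALLOC mode the limits are checked
	 * and -EDQUOT is returned if they would be exceeded.
	 */
	return bch2_quota_acct(c, bch_qid(inode), Q_SPC, sectors,
			       KEY_TYPE_QUOTA_PREALLOC);
}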
diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h
deleted file mode 100644
index dc34347ef6c7..000000000000
--- a/fs/bcachefs/quota_format.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_FORMAT_H
-#define _BCACHEFS_QUOTA_FORMAT_H
-
-/* KEY_TYPE_quota: */
-
-enum quota_types {
- QTYP_USR = 0,
- QTYP_GRP = 1,
- QTYP_PRJ = 2,
- QTYP_NR = 3,
-};
-
-enum quota_counters {
- Q_SPC = 0,
- Q_INO = 1,
- Q_COUNTERS = 2,
-};
-
-struct bch_quota_counter {
- __le64 hardlimit;
- __le64 softlimit;
-};
-
-struct bch_quota {
- struct bch_val v;
- struct bch_quota_counter c[Q_COUNTERS];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
- __le32 timelimit;
- __le32 warnlimit;
-};
-
-struct bch_sb_quota_type {
- __le64 flags;
- struct bch_sb_quota_counter c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
- struct bch_sb_field field;
- struct bch_sb_quota_type q[QTYP_NR];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_QUOTA_FORMAT_H */
diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h
deleted file mode 100644
index 6a136083d389..000000000000
--- a/fs/bcachefs/quota_types.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_QUOTA_TYPES_H
-#define _BCACHEFS_QUOTA_TYPES_H
-
-#include <linux/generic-radix-tree.h>
-
-struct bch_qid {
- u32 q[QTYP_NR];
-};
-
-enum quota_acct_mode {
- KEY_TYPE_QUOTA_PREALLOC,
- KEY_TYPE_QUOTA_WARN,
- KEY_TYPE_QUOTA_NOCHECK,
-};
-
-struct memquota_counter {
- u64 v;
- u64 hardlimit;
- u64 softlimit;
- s64 timer;
- int warns;
- int warning_issued;
-};
-
-struct bch_memquota {
- struct memquota_counter c[Q_COUNTERS];
-};
-
-typedef GENRADIX(struct bch_memquota) bch_memquota_table;
-
-struct quota_limit {
- u32 timelimit;
- u32 warnlimit;
-};
-
-struct bch_memquota_type {
- struct quota_limit limits[Q_COUNTERS];
- bch_memquota_table table;
- struct mutex lock;
-};
-
-#endif /* _BCACHEFS_QUOTA_TYPES_H */
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
deleted file mode 100644
index b1438be9d690..000000000000
--- a/fs/bcachefs/rcu_pending.c
+++ /dev/null
@@ -1,666 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define pr_fmt(fmt) "%s() " fmt "\n", __func__
-
-#include <linux/generic-radix-tree.h>
-#include <linux/mm.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/srcu.h>
-#include <linux/vmalloc.h>
-
-#include "rcu_pending.h"
-#include "darray.h"
-#include "util.h"
-
-#define static_array_for_each(_a, _i) \
- for (typeof(&(_a)[0]) _i = _a; \
- _i < (_a) + ARRAY_SIZE(_a); \
- _i++)
-
-enum rcu_pending_special {
- RCU_PENDING_KVFREE = 1,
- RCU_PENDING_CALL_RCU = 2,
-};
-
-#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
-#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
-
-#ifdef __KERNEL__
-typedef unsigned long rcu_gp_poll_state_t;
-
-static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
-{
- return l == r;
-}
-#else
-typedef struct urcu_gp_poll_state rcu_gp_poll_state_t;
-
-static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
-{
- return l.grace_period_id == r.grace_period_id;
-}
-#endif
-
-static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp)
-{
- return ssp
- ? get_state_synchronize_srcu(ssp)
- : get_state_synchronize_rcu();
-}
-
-static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp)
-{
- return ssp
- ? start_poll_synchronize_srcu(ssp)
- : start_poll_synchronize_rcu();
-}
-
-static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie)
-{
- return ssp
- ? poll_state_synchronize_srcu(ssp, cookie)
- : poll_state_synchronize_rcu(cookie);
-}
-
-static inline void __rcu_barrier(struct srcu_struct *ssp)
-{
- return ssp
- ? srcu_barrier(ssp)
- : rcu_barrier();
-}
-
-static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
- rcu_callback_t func)
-{
- if (ssp)
- call_srcu(ssp, rhp, func);
- else
- call_rcu(rhp, func);
-}
-
-struct rcu_pending_seq {
- /*
- * We're using a radix tree like a vector - we're just pushing elements
- * onto the end; we're using a radix tree instead of an actual vector to
- * avoid reallocation overhead
- */
- GENRADIX(struct rcu_head *) objs;
- size_t nr;
- struct rcu_head **cursor;
- rcu_gp_poll_state_t seq;
-};
-
-struct rcu_pending_list {
- struct rcu_head *head;
- struct rcu_head *tail;
- rcu_gp_poll_state_t seq;
-};
-
-struct rcu_pending_pcpu {
- struct rcu_pending *parent;
- spinlock_t lock;
- int cpu;
-
- /*
- * We can't bound the number of unprocessed gp sequence numbers, and we
- * can't efficiently merge radix trees for expired grace periods, so we
- * need darray/vector:
- */
- DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
-
- /* Third entry is for expired objects: */
- struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
-
- struct rcu_head cb;
- bool cb_armed;
- struct work_struct work;
-};
-
-static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
-{
- if (p->objs.nr)
- return true;
-
- static_array_for_each(p->lists, i)
- if (i->head)
- return true;
-
- return false;
-}
-
-static void rcu_pending_list_merge(struct rcu_pending_list *l1,
- struct rcu_pending_list *l2)
-{
-#ifdef __KERNEL__
- if (!l1->head)
- l1->head = l2->head;
- else
- l1->tail->next = l2->head;
-#else
- if (!l1->head)
- l1->head = l2->head;
- else
- l1->tail->next.next = (void *) l2->head;
-#endif
-
- l1->tail = l2->tail;
- l2->head = l2->tail = NULL;
-}
-
-static void rcu_pending_list_add(struct rcu_pending_list *l,
- struct rcu_head *n)
-{
-#ifdef __KERNEL__
- if (!l->head)
- l->head = n;
- else
- l->tail->next = n;
- l->tail = n;
- n->next = NULL;
-#else
- if (!l->head)
- l->head = n;
- else
- l->tail->next.next = (void *) n;
- l->tail = n;
- n->next.next = NULL;
-#endif
-}
-
-static void merge_expired_lists(struct rcu_pending_pcpu *p)
-{
- struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
-
- for (struct rcu_pending_list *i = p->lists; i < expired; i++)
- if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
- rcu_pending_list_merge(expired, i);
-}
-
-#ifndef __KERNEL__
-static inline void kfree_bulk(size_t nr, void ** p)
-{
- while (nr--)
- kfree(*p);
-}
-#endif
-
-static noinline void __process_finished_items(struct rcu_pending *pending,
- struct rcu_pending_pcpu *p,
- unsigned long flags)
-{
- struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
- struct rcu_pending_seq objs = {};
- struct rcu_head *list = NULL;
-
- if (p->objs.nr &&
- __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
- objs = p->objs.data[0];
- darray_remove_item(&p->objs, p->objs.data);
- }
-
- merge_expired_lists(p);
-
- list = expired->head;
- expired->head = expired->tail = NULL;
-
- spin_unlock_irqrestore(&p->lock, flags);
-
- switch ((ulong) pending->process) {
- case RCU_PENDING_KVFREE:
- for (size_t i = 0; i < objs.nr; ) {
- size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
-
- kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
- i += nr_this_node;
- }
- genradix_free(&objs.objs);
-
- while (list) {
- struct rcu_head *obj = list;
-#ifdef __KERNEL__
- list = obj->next;
-#else
- list = (void *) obj->next.next;
-#endif
-
- /*
- * low bit of pointer indicates whether rcu_head needs
- * to be freed - kvfree_rcu_mightsleep()
- */
- BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
-
- void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
- bool free_head = ((unsigned long) obj->func) & 1UL;
-
- kvfree(ptr);
- if (free_head)
- kfree(obj);
- }
-
- break;
-
- case RCU_PENDING_CALL_RCU:
- for (size_t i = 0; i < objs.nr; i++) {
- struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
- obj->func(obj);
- }
- genradix_free(&objs.objs);
-
- while (list) {
- struct rcu_head *obj = list;
-#ifdef __KERNEL__
- list = obj->next;
-#else
- list = (void *) obj->next.next;
-#endif
- obj->func(obj);
- }
- break;
-
- default:
- for (size_t i = 0; i < objs.nr; i++)
- pending->process(pending, *genradix_ptr(&objs.objs, i));
- genradix_free(&objs.objs);
-
- while (list) {
- struct rcu_head *obj = list;
-#ifdef __KERNEL__
- list = obj->next;
-#else
- list = (void *) obj->next.next;
-#endif
- pending->process(pending, obj);
- }
- break;
- }
-}
-
-static bool process_finished_items(struct rcu_pending *pending,
- struct rcu_pending_pcpu *p,
- unsigned long flags)
-{
- /*
- * XXX: we should grab the gp seq once and avoid multiple function
- * calls, this is called from __rcu_pending_enqueue() fastpath in
- * may_sleep==true mode
- */
- if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
- (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
- (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
- p->lists[2].head) {
- __process_finished_items(pending, p, flags);
- return true;
- }
-
- return false;
-}
-
-static void rcu_pending_work(struct work_struct *work)
-{
- struct rcu_pending_pcpu *p =
- container_of(work, struct rcu_pending_pcpu, work);
- struct rcu_pending *pending = p->parent;
- unsigned long flags;
-
- do {
- spin_lock_irqsave(&p->lock, flags);
- } while (process_finished_items(pending, p, flags));
-
- spin_unlock_irqrestore(&p->lock, flags);
-}
-
-static void rcu_pending_rcu_cb(struct rcu_head *rcu)
-{
- struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
-
- schedule_work_on(p->cpu, &p->work);
-
- unsigned long flags;
- spin_lock_irqsave(&p->lock, flags);
- if (__rcu_pending_has_pending(p)) {
- spin_unlock_irqrestore(&p->lock, flags);
- __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
- } else {
- p->cb_armed = false;
- spin_unlock_irqrestore(&p->lock, flags);
- }
-}
-
-static __always_inline struct rcu_pending_seq *
-get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq)
-{
- darray_for_each_reverse(p->objs, objs)
- if (rcu_gp_poll_cookie_eq(objs->seq, seq))
- return objs;
-
- if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
- return NULL;
-
- return &darray_last(p->objs);
-}
-
-static noinline bool
-rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq,
- struct rcu_head *head, void *ptr,
- unsigned long *flags)
-{
- if (ptr) {
- if (!head) {
- /*
- * kvfree_rcu_mightsleep(): we weren't passed an
- * rcu_head, but we need one: use the low bit of the
- * pointer to free to flag that the head needs to be
- * freed as well:
- */
- ptr = (void *)(((unsigned long) ptr)|1UL);
- head = kmalloc(sizeof(*head), __GFP_NOWARN);
- if (!head) {
- spin_unlock_irqrestore(&p->lock, *flags);
- head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
- /*
- * dropped lock, did GFP_KERNEL allocation,
- * check for gp expiration
- */
- if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
- kvfree(--ptr);
- kfree(head);
- spin_lock_irqsave(&p->lock, *flags);
- return false;
- }
- }
- }
-
- head->func = ptr;
- }
-again:
- for (struct rcu_pending_list *i = p->lists;
- i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
- if (rcu_gp_poll_cookie_eq(i->seq, seq)) {
- rcu_pending_list_add(i, head);
- return false;
- }
- }
-
- for (struct rcu_pending_list *i = p->lists;
- i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
- if (!i->head) {
- i->seq = seq;
- rcu_pending_list_add(i, head);
- return true;
- }
- }
-
- merge_expired_lists(p);
- goto again;
-}
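
A minimal sketch of the low-bit pointer tagging used above for kvfree_rcu_mightsleep(); the helper names tag_ptr()/untag_ptr() are invented for illustration and do not exist in this file:

static inline void *tag_ptr(void *ptr)
{
	/* kmalloc()/kvmalloc() results are at least word aligned, so bit 0 is free */
	return (void *)(((unsigned long) ptr) | 1UL);
}

static inline void *untag_ptr(void *tagged, bool *free_head)
{
	*free_head = ((unsigned long) tagged) & 1UL;	/* was the rcu_head allocated on the side? */
	return (void *)(((unsigned long) tagged) & ~1UL);
}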
-
-/*
- * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
- * pending->process) once a grace period elapses.
- *
- * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
- * back to a linked list.
- *
- * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
- * process callback
- *
- * - If @ptr and @head are both not NULL, we're kvfree_rcu()
- *
- * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
- *
- * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
- * expired items.
- */
-static __always_inline void
-__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
- void *ptr, bool may_sleep)
-{
-
- struct rcu_pending_pcpu *p;
- struct rcu_pending_seq *objs;
- struct genradix_node *new_node = NULL;
- unsigned long flags;
- bool start_gp = false;
-
- BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
-
- /* We could technically be scheduled before taking the lock and end up
- * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock
-	 * anyway
- *
- * And we have to do it this way to avoid breaking PREEMPT_RT, which
- * redefines how spinlocks work:
- */
- p = raw_cpu_ptr(pending->p);
- spin_lock_irqsave(&p->lock, flags);
- rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
-restart:
- if (may_sleep &&
- unlikely(process_finished_items(pending, p, flags)))
- goto check_expired;
-
- /*
- * In kvfree_rcu() mode, the radix tree is only for slab pointers so
- * that we can do kfree_bulk() - vmalloc pointers always use the linked
- * list:
- */
- if (ptr && unlikely(is_vmalloc_addr(ptr)))
- goto list_add;
-
- objs = get_object_radix(p, seq);
- if (unlikely(!objs))
- goto list_add;
-
- if (unlikely(!objs->cursor)) {
- /*
- * New radix tree nodes must be added under @p->lock because the
- * tree root is in a darray that can be resized (typically,
- * genradix supports concurrent unlocked allocation of new
- * nodes) - hence preallocation and the retry loop:
- */
- objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
- objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
- if (unlikely(!objs->cursor)) {
- if (may_sleep) {
- spin_unlock_irqrestore(&p->lock, flags);
-
- gfp_t gfp = GFP_KERNEL;
- if (!head)
- gfp |= __GFP_NOFAIL;
-
- new_node = genradix_alloc_node(gfp);
- if (!new_node)
- may_sleep = false;
- goto check_expired;
- }
-list_add:
- start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
- goto start_gp;
- }
- }
-
- *objs->cursor++ = ptr ?: head;
- /* zero cursor if we hit the end of a radix tree node: */
- if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
- objs->cursor = NULL;
- start_gp = !objs->nr;
- objs->nr++;
-start_gp:
- if (unlikely(start_gp)) {
- /*
- * We only have one callback (ideally, we would have one for
-		 * every outstanding grace period) - so if our callback is
- * already in flight, we may still have to start a grace period
- * (since we used get_state() above, not start_poll())
- */
- if (!p->cb_armed) {
- p->cb_armed = true;
- __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
- } else {
- __start_poll_synchronize_rcu(pending->srcu);
- }
- }
- spin_unlock_irqrestore(&p->lock, flags);
-free_node:
- if (new_node)
- genradix_free_node(new_node);
- return;
-check_expired:
- if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
- switch ((ulong) pending->process) {
- case RCU_PENDING_KVFREE:
- kvfree(ptr);
- break;
- case RCU_PENDING_CALL_RCU:
- head->func(head);
- break;
- default:
- pending->process(pending, head);
- break;
- }
- goto free_node;
- }
-
- p = raw_cpu_ptr(pending->p);
- spin_lock_irqsave(&p->lock, flags);
- goto restart;
-}
-
-void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
-{
- __rcu_pending_enqueue(pending, obj, NULL, true);
-}
-
-static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
-{
- struct rcu_head *ret = NULL;
-
- spin_lock_irq(&p->lock);
- darray_for_each(p->objs, objs)
- if (objs->nr) {
- ret = *genradix_ptr(&objs->objs, --objs->nr);
- objs->cursor = NULL;
- if (!objs->nr)
- genradix_free(&objs->objs);
- goto out;
- }
-
- static_array_for_each(p->lists, i)
- if (i->head) {
- ret = i->head;
-#ifdef __KERNEL__
- i->head = ret->next;
-#else
- i->head = (void *) ret->next.next;
-#endif
- if (!i->head)
- i->tail = NULL;
- goto out;
- }
-out:
- spin_unlock_irq(&p->lock);
-
- return ret;
-}
-
-struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
-{
- return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
-}
-
-struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
-{
- struct rcu_head *ret = rcu_pending_dequeue(pending);
-
- if (ret)
- return ret;
-
- int cpu;
- for_each_possible_cpu(cpu) {
- ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
- if (ret)
- break;
- }
- return ret;
-}
-
-static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
-{
- int cpu;
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- spin_lock_irq(&p->lock);
- if (__rcu_pending_has_pending(p) || p->cb_armed) {
- spin_unlock_irq(&p->lock);
- return true;
- }
- spin_unlock_irq(&p->lock);
- }
-
- return false;
-}
-
-void rcu_pending_exit(struct rcu_pending *pending)
-{
- int cpu;
-
- if (!pending->p)
- return;
-
- while (rcu_pending_has_pending_or_armed(pending)) {
- __rcu_barrier(pending->srcu);
-
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- flush_work(&p->work);
- }
- }
-
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- flush_work(&p->work);
- }
-
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
-
- static_array_for_each(p->lists, i)
- WARN_ON(i->head);
- WARN_ON(p->objs.nr);
- darray_exit(&p->objs);
- }
- free_percpu(pending->p);
-}
-
-/**
- * rcu_pending_init() - initialize a rcu_pending
- *
- * @pending: Object to init
- * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
- * RCU flavor
- * @process:	Callback function invoked on objects once their grace period
- *		has elapsed; if NULL, kvfree() is used.
- */
-int rcu_pending_init(struct rcu_pending *pending,
- struct srcu_struct *srcu,
- rcu_pending_process_fn process)
-{
- pending->p = alloc_percpu(struct rcu_pending_pcpu);
- if (!pending->p)
- return -ENOMEM;
-
- int cpu;
- for_each_possible_cpu(cpu) {
- struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
- p->parent = pending;
- p->cpu = cpu;
- spin_lock_init(&p->lock);
- darray_init(&p->objs);
- INIT_WORK(&p->work, rcu_pending_work);
- }
-
- pending->srcu = srcu;
- pending->process = process;
-
- return 0;
-}
diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h
deleted file mode 100644
index 71a2f4ddaade..000000000000
--- a/fs/bcachefs/rcu_pending.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_RCU_PENDING_H
-#define _LINUX_RCU_PENDING_H
-
-#include <linux/rcupdate.h>
-
-struct rcu_pending;
-typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
-
-struct rcu_pending_pcpu;
-
-struct rcu_pending {
- struct rcu_pending_pcpu __percpu *p;
- struct srcu_struct *srcu;
- rcu_pending_process_fn process;
-};
-
-void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
-struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
-struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
-
-void rcu_pending_exit(struct rcu_pending *pending);
-int rcu_pending_init(struct rcu_pending *pending,
- struct srcu_struct *srcu,
- rcu_pending_process_fn process);
-
-#endif /* _LINUX_RCU_PENDING_H */
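
A rough usage sketch of the API declared above, assuming the plain RCU flavor (NULL @srcu); struct my_obj and the my_* function names are invented for illustration:

struct my_obj {
	struct rcu_head	rcu;
	int		data;
};

static struct rcu_pending pending;

static void my_process(struct rcu_pending *p, struct rcu_head *rcu)
{
	/* runs once the object's grace period has elapsed */
	kfree(container_of(rcu, struct my_obj, rcu));
}

static int my_init(void)
{
	return rcu_pending_init(&pending, NULL, my_process);
}

static void my_obj_free(struct my_obj *obj)
{
	rcu_pending_enqueue(&pending, &obj->rcu);	/* deferred until after a grace period */
}

static void my_exit(void)
{
	rcu_pending_exit(&pending);	/* waits for and processes everything outstanding */
}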
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
deleted file mode 100644
index 1c345b86b1c0..000000000000
--- a/fs/bcachefs/rebalance.c
+++ /dev/null
@@ -1,889 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "clock.h"
-#include "compress.h"
-#include "disk_groups.h"
-#include "errcode.h"
-#include "error.h"
-#include "inode.h"
-#include "io_write.h"
-#include "move.h"
-#include "rebalance.h"
-#include "subvolume.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/sched/cputime.h>
-
-/* bch_extent_rebalance: */
-
-static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
-{
- const union bch_extent_entry *entry;
-
- bkey_extent_entry_for_each(ptrs, entry)
- if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
- return &entry->rebalance;
-
- return NULL;
-}
-
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
-{
- return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
-}
-
-static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s_c k,
- struct bkey_ptrs_c ptrs)
-{
- if (!opts->background_compression)
- return 0;
-
- unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned ptr_bit = 1;
- unsigned rewrite_ptrs = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten)
- return 0;
-
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
-
- return rewrite_ptrs;
-}
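
A worked example of the bitmask returned above (the pointer states are made up): with three pointers where only the second is non-cached and compressed with the wrong type, the loop sets only bit 1:

	/*
	 * ptr 0: cached, or already has the target compression  -> bit 0 clear
	 * ptr 1: !cached and wrong compression type             -> bit 1 set
	 * ptr 2: cached, or already has the target compression  -> bit 2 clear
	 *
	 * rewrite_ptrs == 0x2 (0b010): only the second pointer is rewritten.
	 */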
-
-static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_ptrs_c ptrs)
-{
- if (!opts->background_target ||
- !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
- return 0;
-
- unsigned ptr_bit = 1;
- unsigned rewrite_ptrs = 0;
-
- guard(rcu)();
- bkey_for_each_ptr(ptrs, ptr) {
- if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
- rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
-
- return rewrite_ptrs;
-}
-
-static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
- struct bch_io_opts *opts,
- struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return 0;
-
- return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
- bch2_bkey_ptrs_need_move(c, opts, ptrs);
-}
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
- if (!opts)
- return 0;
-
- if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return 0;
-
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- u64 sectors = 0;
-
- if (opts->background_compression) {
- unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten) {
- sectors = 0;
- goto incompressible;
- }
-
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- sectors += p.crc.compressed_size;
- }
- }
-incompressible:
- if (opts->background_target) {
- guard(rcu)();
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached &&
- !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
- sectors += p.crc.compressed_size;
- }
-
- return sectors;
-}
-
-static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
- struct bkey_s_c k)
-{
- if (!bkey_extent_is_direct_data(k.k))
- return 0;
-
- const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
-
- if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
- struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
- return old == NULL || memcmp(old, &new, sizeof(new));
- } else {
- return old != NULL;
- }
-}
-
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
- struct bkey_i *_k)
-{
- if (!bkey_extent_is_direct_data(&_k->k))
- return 0;
-
- struct bkey_s k = bkey_i_to_s(_k);
- struct bch_extent_rebalance *old =
- (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
-
- if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
- if (!old) {
- old = bkey_val_end(k);
- k.k->u64s += sizeof(*old) / sizeof(u64);
- }
-
- *old = io_opts_to_rebalance_opts(c, opts);
- } else {
- if (old)
- extent_entry_drop(k, (union bch_extent_entry *) old);
- }
-
- return 0;
-}
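
Since struct bch_extent_rebalance packs into a single __u64 (see rebalance_format.h below), appending it above grows the key's value by exactly one u64:

	/* sizeof(struct bch_extent_rebalance) / sizeof(u64) == 8 / 8 == 1 */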
-
-int bch2_get_update_rebalance_opts(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- BUG_ON(iter->flags & BTREE_ITER_is_extents);
- BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
-
- const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
- ? bch2_bkey_rebalance_opts(k) : NULL;
- if (r) {
-#define x(_name) \
- if (r->_name##_from_inode) { \
- io_opts->_name = r->_name; \
- io_opts->_name##_from_inode = true; \
- }
- BCH_REBALANCE_OPTS()
-#undef x
- }
-
- if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
- return 0;
-
- struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- bkey_reassemble(n, k);
-
-	/* On successful transaction commit, @k was invalidated: */
-
- return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
- bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
-}
-
-#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
-
-static const char * const bch2_rebalance_state_strs[] = {
-#define x(t) #t,
- BCH_REBALANCE_STATES()
- NULL
-#undef x
-};
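
Given the BCH_REBALANCE_STATES() x-macro in rebalance_types.h below (waiting, working, scanning), the array above expands to roughly:

static const char * const bch2_rebalance_state_strs[] = {
	"waiting", "working", "scanning", NULL
};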
-
-int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i_cookie *cookie;
- u64 v;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
- SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- v = k.k->type == KEY_TYPE_cookie
- ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
- : 0;
-
- cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
- ret = PTR_ERR_OR_ZERO(cookie);
- if (ret)
- goto err;
-
- bkey_cookie_init(&cookie->k_i);
- cookie->k.p = iter.pos;
- cookie->v.cookie = cpu_to_le64(v + 1);
-
- ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
-{
- int ret = bch2_trans_commit_do(c, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
- bch2_set_rebalance_needs_scan_trans(trans, inum));
- bch2_rebalance_wakeup(c);
- return ret;
-}
-
-int bch2_set_fs_needs_rebalance(struct bch_fs *c)
-{
- return bch2_set_rebalance_needs_scan(c, 0);
-}
-
-static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 v;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
- SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- v = k.k->type == KEY_TYPE_cookie
- ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
- : 0;
-
- if (v == cookie)
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
- struct btree_iter *work_iter)
-{
- return !kthread_should_stop()
- ? bch2_btree_iter_peek(trans, work_iter)
- : bkey_s_c_null;
-}
-
-static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
- return 0;
-
- struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
- int ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- return ret;
-
- extent_entry_drop(bkey_i_to_s(n),
- (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
-static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
- struct bpos work_pos,
- struct btree_iter *extent_iter,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bch_fs *c = trans->c;
-
- bch2_trans_iter_exit(trans, extent_iter);
- bch2_trans_iter_init(trans, extent_iter,
- work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
- work_pos,
- BTREE_ITER_all_snapshots);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
- if (bkey_err(k))
- return k;
-
- int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
- if (ret)
- return bkey_s_c_err(ret);
-
- memset(data_opts, 0, sizeof(*data_opts));
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
- data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_only_specified_devs;
-
- if (!data_opts->rewrite_ptrs) {
- /*
- * device we would want to write to offline? devices in target
- * changed?
- *
- * We'll now need a full scan before this extent is picked up
- * again:
- */
- int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
- if (ret)
- return bkey_s_c_err(ret);
- return bkey_s_c_null;
- }
-
- if (trace_rebalance_extent_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
- if (p) {
- prt_str(&buf, "compression=");
- bch2_compression_opt_to_text(&buf, io_opts->background_compression);
- prt_str(&buf, " ");
- bch2_prt_u64_base2(&buf, p);
- prt_newline(&buf);
- }
-
- p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
- if (p) {
- prt_str(&buf, "move=");
- bch2_target_to_text(&buf, c, io_opts->background_target);
- prt_str(&buf, " ");
- bch2_prt_u64_base2(&buf, p);
- prt_newline(&buf);
- }
-
- trace_rebalance_extent(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- return k;
-}
-
-noinline_for_stack
-static int do_rebalance_extent(struct moving_context *ctxt,
- struct bpos work_pos,
- struct btree_iter *extent_iter)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct bch_fs_rebalance *r = &trans->c->rebalance;
- struct data_update_opts data_opts;
- struct bch_io_opts io_opts;
- struct bkey_s_c k;
- struct bkey_buf sk;
- int ret;
-
- ctxt->stats = &r->work_stats;
- r->state = BCH_REBALANCE_working;
-
- bch2_bkey_buf_init(&sk);
-
- ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
- extent_iter, &io_opts, &data_opts));
- if (ret || !k.k)
- goto out;
-
- atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
-
- /*
- * The iterator gets unlocked by __bch2_read_extent - need to
- * save a copy of @k elsewhere:
- */
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
- if (ret) {
- if (bch2_err_matches(ret, ENOMEM)) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- ret = bch_err_throw(c, transaction_restart_nested);
- }
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto out;
-
- /* skip it and continue, XXX signal failure */
- ret = 0;
- }
-out:
- bch2_bkey_buf_exit(&sk, c);
- return ret;
-}
-
-static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct bch_fs_rebalance *r = &trans->c->rebalance;
-
- bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
- ctxt->stats = &r->scan_stats;
-
- if (!inum) {
- r->scan_start = BBPOS_MIN;
- r->scan_end = BBPOS_MAX;
- } else {
- r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
- r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
- }
-
- r->state = BCH_REBALANCE_scanning;
-
- struct per_snapshot_io_opts snapshot_io_opts;
- per_snapshot_io_opts_init(&snapshot_io_opts, c);
-
- int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- r->scan_start.pos, r->scan_end.pos,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_not_extents|
- BTREE_ITER_prefetch, k, ({
- ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
-
- struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
- &snapshot_io_opts, iter.pos, &iter, k);
- PTR_ERR_OR_ZERO(io_opts);
- })) ?:
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_rebalance_needs_scan(trans, inum, cookie));
-
- per_snapshot_io_opts_exit(&snapshot_io_opts);
- bch2_move_stats_exit(&r->scan_stats, trans->c);
-
- /*
- * Ensure that the rebalance_work entries we created are seen by the
- * next iteration of do_rebalance(), so we don't end up stuck in
- * rebalance_wait():
- */
- atomic64_inc(&r->scan_stats.sectors_seen);
- bch2_btree_write_buffer_flush_sync(trans);
-
- return ret;
-}
-
-static void rebalance_wait(struct bch_fs *c)
-{
- struct bch_fs_rebalance *r = &c->rebalance;
- struct io_clock *clock = &c->io_clock[WRITE];
- u64 now = atomic64_read(&clock->now);
- u64 min_member_capacity = bch2_min_rw_member_capacity(c);
-
- if (min_member_capacity == U64_MAX)
- min_member_capacity = 128 * 2048;
-
- r->wait_iotime_end = now + (min_member_capacity >> 6);
-
- if (r->state != BCH_REBALANCE_waiting) {
- r->wait_iotime_start = now;
- r->wait_wallclock_start = ktime_get_real_ns();
- r->state = BCH_REBALANCE_waiting;
- }
-
- bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
-}
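
A rough worked example of the wait computed above, using the 128 * 2048-sector fallback capacity:

	/*
	 * capacity = 128 * 2048 sectors = 128 MiB
	 * wait     = capacity >> 6      = 4096 sectors = 2 MiB
	 * i.e. rebalance sleeps until roughly 2 MiB more has been written,
	 * as measured by the filesystem's write IO clock.
	 */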
-
-static bool bch2_rebalance_enabled(struct bch_fs *c)
-{
- return c->opts.rebalance_enabled &&
- !(c->opts.rebalance_on_ac_only &&
- c->rebalance.on_battery);
-}
-
-static int do_rebalance(struct moving_context *ctxt)
-{
- struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- struct bch_fs_rebalance *r = &c->rebalance;
- struct btree_iter rebalance_work_iter, extent_iter = {};
- struct bkey_s_c k;
- u32 kick = r->kick;
- int ret = 0;
-
- bch2_trans_begin(trans);
-
- bch2_move_stats_init(&r->work_stats, "rebalance_work");
- bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
-
- bch2_trans_iter_init(trans, &rebalance_work_iter,
- BTREE_ID_rebalance_work, POS_MIN,
- BTREE_ITER_all_snapshots);
-
- while (!bch2_move_ratelimit(ctxt)) {
- if (!bch2_rebalance_enabled(c)) {
- bch2_moving_ctxt_flush_all(ctxt);
- kthread_wait_freezable(bch2_rebalance_enabled(c) ||
- kthread_should_stop());
- }
-
- if (kthread_should_stop())
- break;
-
- bch2_trans_begin(trans);
-
- ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret || !k.k)
- break;
-
- ret = k.k->type == KEY_TYPE_cookie
- ? do_rebalance_scan(ctxt, k.k->p.inode,
- le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
- : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- bch2_btree_iter_advance(trans, &rebalance_work_iter);
- }
-
- bch2_trans_iter_exit(trans, &extent_iter);
- bch2_trans_iter_exit(trans, &rebalance_work_iter);
- bch2_move_stats_exit(&r->scan_stats, c);
-
- if (!ret &&
- !kthread_should_stop() &&
- !atomic64_read(&r->work_stats.sectors_seen) &&
- !atomic64_read(&r->scan_stats.sectors_seen) &&
- kick == r->kick) {
- bch2_moving_ctxt_flush_all(ctxt);
- bch2_trans_unlock_long(trans);
- rebalance_wait(c);
- }
-
- if (!bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int bch2_rebalance_thread(void *arg)
-{
- struct bch_fs *c = arg;
- struct bch_fs_rebalance *r = &c->rebalance;
- struct moving_context ctxt;
-
- set_freezable();
-
- /*
- * Data move operations can't run until after check_snapshots has
-	 * completed and bch2_snapshot_is_ancestor() is available.
- */
- kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
- kthread_should_stop());
-
- bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
- writepoint_ptr(&c->rebalance_write_point),
- true);
-
- while (!kthread_should_stop() && !do_rebalance(&ctxt))
- ;
-
- bch2_moving_ctxt_exit(&ctxt);
-
- return 0;
-}
-
-void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
- printbuf_tabstop_push(out, 32);
-
- struct bch_fs_rebalance *r = &c->rebalance;
-
- /* print pending work */
- struct disk_accounting_pos acc;
- disk_accounting_key_init(acc, rebalance_work);
- u64 v;
- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
-
- prt_printf(out, "pending work:\t");
- prt_human_readable_u64(out, v << 9);
- prt_printf(out, "\n\n");
-
- prt_str(out, bch2_rebalance_state_strs[r->state]);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- switch (r->state) {
- case BCH_REBALANCE_waiting: {
- u64 now = atomic64_read(&c->io_clock[WRITE].now);
-
- prt_printf(out, "io wait duration:\t");
- bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
- prt_newline(out);
-
- prt_printf(out, "io wait remaining:\t");
- bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
- prt_newline(out);
-
- prt_printf(out, "duration waited:\t");
- bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
- prt_newline(out);
- break;
- }
- case BCH_REBALANCE_working:
- bch2_move_stats_to_text(out, &r->work_stats);
- break;
- case BCH_REBALANCE_scanning:
- bch2_move_stats_to_text(out, &r->scan_stats);
- break;
- }
- prt_newline(out);
-
- struct task_struct *t;
- scoped_guard(rcu) {
- t = rcu_dereference(c->rebalance.thread);
- if (t)
- get_task_struct(t);
- }
-
- if (t) {
- bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
- put_task_struct(t);
- }
-
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_rebalance_stop(struct bch_fs *c)
-{
- struct task_struct *p;
-
- c->rebalance.pd.rate.rate = UINT_MAX;
- bch2_ratelimit_reset(&c->rebalance.pd.rate);
-
- p = rcu_dereference_protected(c->rebalance.thread, 1);
- c->rebalance.thread = NULL;
-
- if (p) {
-		/* for synchronizing with bch2_rebalance_wakeup() */
- synchronize_rcu();
-
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-int bch2_rebalance_start(struct bch_fs *c)
-{
- struct task_struct *p;
- int ret;
-
- if (c->rebalance.thread)
- return 0;
-
- if (c->opts.nochanges)
- return 0;
-
- p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
- ret = PTR_ERR_OR_ZERO(p);
- bch_err_msg(c, ret, "creating rebalance thread");
- if (ret)
- return ret;
-
- get_task_struct(p);
- rcu_assign_pointer(c->rebalance.thread, p);
- wake_up_process(p);
- return 0;
-}
-
-#ifdef CONFIG_POWER_SUPPLY
-#include <linux/power_supply.h>
-
-static int bch2_rebalance_power_notifier(struct notifier_block *nb,
- unsigned long event, void *data)
-{
- struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);
-
- c->rebalance.on_battery = !power_supply_is_system_supplied();
- bch2_rebalance_wakeup(c);
- return NOTIFY_OK;
-}
-#endif
-
-void bch2_fs_rebalance_exit(struct bch_fs *c)
-{
-#ifdef CONFIG_POWER_SUPPLY
- power_supply_unreg_notifier(&c->rebalance.power_notifier);
-#endif
-}
-
-int bch2_fs_rebalance_init(struct bch_fs *c)
-{
- struct bch_fs_rebalance *r = &c->rebalance;
-
- bch2_pd_controller_init(&r->pd);
-
-#ifdef CONFIG_POWER_SUPPLY
- r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
- int ret = power_supply_reg_notifier(&r->power_notifier);
- if (ret)
- return ret;
-
- r->on_battery = !power_supply_is_system_supplied();
-#endif
- return 0;
-}
-
-static int check_rebalance_work_one(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct btree_iter *rebalance_iter,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c extent_k, rebalance_k;
- struct printbuf buf = PRINTBUF;
-
- int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?:
- bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter));
- if (ret)
- return ret;
-
- if (!extent_k.k &&
- extent_iter->btree_id == BTREE_ID_reflink &&
- (!rebalance_k.k ||
- rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
- bch2_trans_iter_exit(trans, extent_iter);
- bch2_trans_iter_init(trans, extent_iter,
- BTREE_ID_extents, POS_MIN,
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots);
- return bch_err_throw(c, transaction_restart_nested);
- }
-
- if (!extent_k.k && !rebalance_k.k)
- return 1;
-
- int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX,
- rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);
-
- struct bkey deleted;
- bkey_init(&deleted);
-
- if (cmp < 0) {
- deleted.p = extent_k.k->p;
- rebalance_k.k = &deleted;
- } else if (cmp > 0) {
- deleted.p = rebalance_k.k->p;
- extent_k.k = &deleted;
- }
-
- bool should_have_rebalance =
- bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
- bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
-
- if (should_have_rebalance != have_rebalance) {
- ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
- if (ret)
- return ret;
-
- bch2_bkey_val_to_text(&buf, c, extent_k);
- }
-
- if (fsck_err_on(!should_have_rebalance && have_rebalance,
- trans, rebalance_work_incorrectly_set,
- "rebalance work incorrectly set\n%s", buf.buf)) {
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- extent_k.k->p, false);
- if (ret)
- goto err;
- }
-
- if (fsck_err_on(should_have_rebalance && !have_rebalance,
- trans, rebalance_work_incorrectly_unset,
- "rebalance work incorrectly unset\n%s", buf.buf)) {
- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- extent_k.k->p, true);
- if (ret)
- goto err;
- }
-
- if (cmp <= 0)
- bch2_btree_iter_advance(trans, extent_iter);
- if (cmp >= 0)
- bch2_btree_iter_advance(trans, rebalance_iter);
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
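
An illustrative trace of the merge-join above (the positions are made up): whichever side is behind gets a synthetic deleted key, so both btrees advance in lockstep:

	/*
	 * extents:          10   20        40
	 * rebalance_work:        20   30
	 *
	 * pos 10: cmp < 0  -> rebalance_k replaced by a deleted key at 10
	 * pos 20: cmp == 0 -> both keys compared directly
	 * pos 30: cmp > 0  -> extent_k replaced by a deleted key at 30
	 */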
-
-int bch2_check_rebalance_work(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter rebalance_iter, extent_iter;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &extent_iter,
- BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_prefetch);
- bch2_trans_iter_init(trans, &rebalance_iter,
- BTREE_ID_rebalance_work, POS_MIN,
- BTREE_ITER_prefetch);
-
- struct bkey_buf last_flushed;
- bch2_bkey_buf_init(&last_flushed);
- bkey_init(&last_flushed.k->k);
-
- while (!ret) {
- bch2_trans_begin(trans);
-
- ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- }
-
- bch2_bkey_buf_exit(&last_flushed, c);
- bch2_trans_iter_exit(trans, &extent_iter);
- bch2_trans_iter_exit(trans, &rebalance_iter);
- bch2_trans_put(trans);
- return ret < 0 ? ret : 0;
-}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
deleted file mode 100644
index 7a565ea7dbfc..000000000000
--- a/fs/bcachefs/rebalance.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_H
-#define _BCACHEFS_REBALANCE_H
-
-#include "compress.h"
-#include "disk_groups.h"
-#include "opts.h"
-#include "rebalance_types.h"
-
-static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
- struct bch_io_opts *opts)
-{
- struct bch_extent_rebalance r = {
- .type = BIT(BCH_EXTENT_ENTRY_rebalance),
-#define x(_name) \
- ._name = opts->_name, \
- ._name##_from_inode = opts->_name##_from_inode,
- BCH_REBALANCE_OPTS()
-#undef x
- };
-
- if (r.background_target &&
- !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
- r.background_target = 0;
-
- return r;
-};
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
-int bch2_get_update_rebalance_opts(struct btree_trans *,
- struct bch_io_opts *,
- struct btree_iter *,
- struct bkey_s_c);
-
-int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
-int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
-int bch2_set_fs_needs_rebalance(struct bch_fs *);
-
-static inline void bch2_rebalance_wakeup(struct bch_fs *c)
-{
- c->rebalance.kick++;
- guard(rcu)();
- struct task_struct *p = rcu_dereference(c->rebalance.thread);
- if (p)
- wake_up_process(p);
-}
-
-void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_rebalance_stop(struct bch_fs *);
-int bch2_rebalance_start(struct bch_fs *);
-
-void bch2_fs_rebalance_exit(struct bch_fs *);
-int bch2_fs_rebalance_init(struct bch_fs *);
-
-int bch2_check_rebalance_work(struct bch_fs *);
-
-#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h
deleted file mode 100644
index ff9a1342a22b..000000000000
--- a/fs/bcachefs/rebalance_format.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_FORMAT_H
-#define _BCACHEFS_REBALANCE_FORMAT_H
-
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:3,
-
- promote_target_from_inode:1,
- erasure_code_from_inode:1,
- data_checksum_from_inode:1,
- background_compression_from_inode:1,
- data_replicas_from_inode:1,
- background_target_from_inode:1,
-
- promote_target:16,
- erasure_code:1,
- data_checksum:4,
- data_replicas:4,
- background_compression:8, /* enum bch_compression_opt */
- background_target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 background_target:16,
- background_compression:8,
- data_replicas:4,
- data_checksum:4,
- erasure_code:1,
- promote_target:16,
-
- background_target_from_inode:1,
- data_replicas_from_inode:1,
- background_compression_from_inode:1,
- data_checksum_from_inode:1,
- erasure_code_from_inode:1,
- promote_target_from_inode:1,
-
- unused:3,
- type:6;
-#endif
-};
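
The field widths above are chosen so the whole struct packs into a single __u64; as a quick check:

	/* 6 (type) + 3 (unused) + 6 (the *_from_inode flags)
	 * + 16 + 1 + 4 + 4 + 8 + 16 (the option values) = 64 bits
	 */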
-
-/* subset of BCH_INODE_OPTS */
-#define BCH_REBALANCE_OPTS() \
- x(data_checksum) \
- x(background_compression) \
- x(data_replicas) \
- x(promote_target) \
- x(background_target) \
- x(erasure_code)
-
-#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
-
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
deleted file mode 100644
index c659da149fa3..000000000000
--- a/fs/bcachefs/rebalance_types.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REBALANCE_TYPES_H
-#define _BCACHEFS_REBALANCE_TYPES_H
-
-#include "bbpos_types.h"
-#include "move_types.h"
-
-#define BCH_REBALANCE_STATES() \
- x(waiting) \
- x(working) \
- x(scanning)
-
-enum bch_rebalance_states {
-#define x(t) BCH_REBALANCE_##t,
- BCH_REBALANCE_STATES()
-#undef x
-};
-
-struct bch_fs_rebalance {
- struct task_struct __rcu *thread;
- u32 kick;
- struct bch_pd_controller pd;
-
- enum bch_rebalance_states state;
- u64 wait_iotime_start;
- u64 wait_iotime_end;
- u64 wait_wallclock_start;
-
- struct bch_move_stats work_stats;
-
- struct bbpos scan_start;
- struct bbpos scan_end;
- struct bch_move_stats scan_stats;
-
- bool on_battery;
-#ifdef CONFIG_POWER_SUPPLY
- struct notifier_block power_notifier;
-#endif
-};
-
-#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
deleted file mode 100644
index c94debb12d2f..000000000000
--- a/fs/bcachefs/recovery.c
+++ /dev/null
@@ -1,1306 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "bkey_buf.h"
-#include "btree_journal_iter.h"
-#include "btree_node_scan.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "buckets.h"
-#include "dirent.h"
-#include "disk_accounting.h"
-#include "errcode.h"
-#include "error.h"
-#include "journal_io.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "logged_ops.h"
-#include "move.h"
-#include "movinggc.h"
-#include "namei.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "sb-downgrade.h"
-#include "snapshot.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-#include <linux/stat.h>
-
-int bch2_btree_lost_data(struct bch_fs *c,
- struct printbuf *msg,
- enum btree_id btree)
-{
- u64 b = BIT_ULL(btree);
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- if (!(c->sb.btrees_lost_data & b)) {
- prt_printf(msg, "flagging btree ");
- bch2_btree_id_to_text(msg, btree);
- prt_printf(msg, " lost data\n");
-
- ext->btrees_lost_data |= cpu_to_le64(b);
- }
-
-	/* Once we have runtime self-healing for topology errors we won't need this: */
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
-
- /* Btree node accounting will be off: */
- __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- /*
- * These are much more minor, and don't need to be corrected right away,
- * but in debug mode we want the next fsck run to be clean:
- */
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret;
-#endif
-
- switch (btree) {
- case BTREE_ID_alloc:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
- goto out;
- case BTREE_ID_backpointers:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret;
- goto out;
- case BTREE_ID_need_discard:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
- goto out;
- case BTREE_ID_freespace:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
- goto out;
- case BTREE_ID_bucket_gens:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
- goto out;
- case BTREE_ID_lru:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
- goto out;
- case BTREE_ID_accounting:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
- goto out;
- case BTREE_ID_snapshots:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
- goto out;
- default:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
- goto out;
- }
-out:
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-static void kill_btree(struct bch_fs *c, enum btree_id btree)
-{
- bch2_btree_id_root(c, btree)->alive = false;
- bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
-}
-
-/* for -o reconstruct_alloc: */
-void bch2_reconstruct_alloc(struct bch_fs *c)
-{
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
-
- __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_dev_usage_buckets_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_dev_usage_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_dev_usage_fragmented_wrong, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_btree_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_cached_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent);
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
-
- c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
- c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info));
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++)
- if (btree_id_is_alloc(i))
- kill_btree(c, i);
-}
-
-/*
- * Btree node pointers have a field to stash a pointer to the in-memory
- * node; we need to zero out this field when reading in btree nodes, or when
- * reading in keys from the journal:
- */
-static void zero_out_btree_mem_ptr(struct journal_keys *keys)
-{
- darray_for_each(*keys, i)
- if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
-}
-
-/* journal replay: */
-
-static void replay_now_at(struct journal *j, u64 seq)
-{
- BUG_ON(seq < j->replay_journal_seq);
-
- seq = min(seq, j->replay_journal_seq_end);
-
- while (j->replay_journal_seq < seq)
- bch2_journal_pin_put(j, j->replay_journal_seq++);
-}
-
-static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
- struct journal_key *k)
-{
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
- BTREE_MAX_DEPTH, k->level,
- BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(trans, &iter);
- if (ret)
- goto out;
-
- struct bkey u;
- struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
-
- /* Has this delta already been applied to the btree? */
- if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
- ret = 0;
- goto out;
- }
-
- struct bkey_i *new = k->k;
- if (old.k->type == KEY_TYPE_accounting) {
- new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- bch2_accounting_accumulate(bkey_i_to_accounting(new),
- bkey_s_c_to_accounting(old));
- }
-
- trans->journal_res.seq = k->journal_seq;
-
- ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_journal_replay_key(struct btree_trans *trans,
- struct journal_key *k)
-{
- struct btree_iter iter;
- unsigned iter_flags =
- BTREE_ITER_intent|
- BTREE_ITER_not_extents;
- unsigned update_flags = BTREE_TRIGGER_norun;
- int ret;
-
- if (k->overwritten)
- return 0;
-
- trans->journal_res.seq = k->journal_seq;
-
- /*
- * BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
- * keep the key cache coherent with the underlying btree. Nothing
- * besides the allocator is doing updates yet so we don't need key cache
- * coherency for non-alloc btrees, and key cache fills for snapshots
- * btrees use BTREE_ITER_filter_snapshots, which isn't available until
- * the snapshots recovery pass runs.
- */
- if (!k->level && k->btree_id == BTREE_ID_alloc)
- iter_flags |= BTREE_ITER_cached;
- else
- update_flags |= BTREE_UPDATE_key_cache_reclaim;
-
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
- BTREE_MAX_DEPTH, k->level,
- iter_flags);
- ret = bch2_btree_iter_traverse(trans, &iter);
- if (ret)
- goto out;
-
- struct btree_path *path = btree_iter_path(trans, &iter);
- if (unlikely(!btree_path_node(path, k->level))) {
- struct bch_fs *c = trans->c;
-
- CLASS(printbuf, buf)();
- prt_str(&buf, "btree=");
- bch2_btree_id_to_text(&buf, k->btree_id);
- prt_printf(&buf, " level=%u ", k->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
-
- if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
- BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
- bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s",
- buf.buf);
- ret = -EINVAL;
- }
-
- if (!k->allocated) {
- bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s",
- buf.buf);
- k->overwritten = true;
- goto out;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
- BTREE_MAX_DEPTH, 0, iter_flags);
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_increase_depth(trans, iter.path, 0) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-
- /* Must be checked with btree locked: */
- if (k->overwritten)
- goto out;
-
- if (k->k->k.type == KEY_TYPE_accounting) {
- struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto out;
-
- bkey_copy(n, k->k);
- goto out;
- }
-
- ret = bch2_trans_update(trans, &iter, k->k, update_flags);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int journal_sort_seq_cmp(const void *_l, const void *_r)
-{
- const struct journal_key *l = *((const struct journal_key **)_l);
- const struct journal_key *r = *((const struct journal_key **)_r);
-
- /*
-	 * Map 0 to U64_MAX, so that keys with journal_seq == 0 come last
- *
- * journal_seq == 0 means that the key comes from early repair, and
- * should be inserted last so as to avoid overflowing the journal
- */
- return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
-}
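
A small worked example of the unsigned wrap-around above: subtracting 1 maps journal_seq 0 to U64_MAX, so early-repair keys sort after everything read from the journal:

	/*
	 * journal_seq:  5        9        0 (early repair)
	 * seq - 1:      4        8        U64_MAX
	 * sort order:   5, 9, then the repair key last
	 */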
-
-int bch2_journal_replay(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
- DARRAY(struct journal_key *) keys_sorted = { 0 };
- struct journal *j = &c->journal;
- u64 start_seq = c->journal_replay_seq_start;
- u64 end_seq = c->journal_replay_seq_start;
- struct btree_trans *trans = NULL;
- bool immediate_flush = false;
- int ret = 0;
-
- if (keys->nr) {
- ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
- keys->nr, start_seq, end_seq);
- if (ret)
- goto err;
- }
-
- BUG_ON(!atomic_read(&keys->ref));
-
- move_gap(keys, keys->nr);
- trans = bch2_trans_get(c);
-
- /*
- * Replay accounting keys first: we can't allow the write buffer to
- * flush accounting keys until we're done
- */
- darray_for_each(*keys, k) {
- if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
- continue;
-
- cond_resched();
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_skip_accounting_apply|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_WATERMARK_reclaim,
- bch2_journal_replay_accounting_key(trans, k));
- if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
- goto err;
-
- k->overwritten = true;
- }
-
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
-
- /*
- * First, attempt to replay keys in sorted order. This is more
- * efficient - better locality of btree access - but some might fail if
- * that would cause a journal deadlock.
- */
- darray_for_each(*keys, k) {
- cond_resched();
-
- /*
-		 * k->allocated means the key wasn't read in from the journal;
-		 * rather, it came from early repair code
- */
- if (k->allocated)
- immediate_flush = true;
-
- /* Skip fastpath if we're low on space in the journal */
- ret = c->journal.watermark ? -1 :
- commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_journal_reclaim|
- BCH_TRANS_COMMIT_skip_accounting_apply|
- (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
- bch2_journal_replay_key(trans, k));
- BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
- if (ret) {
- ret = darray_push(&keys_sorted, k);
- if (ret)
- goto err;
- }
- }
-
- bch2_trans_unlock_long(trans);
- /*
- * Now, replay any remaining keys in the order in which they appear in
- * the journal, unpinning those journal entries as we go:
- */
- sort_nonatomic(keys_sorted.data, keys_sorted.nr,
- sizeof(keys_sorted.data[0]),
- journal_sort_seq_cmp, NULL);
-
- darray_for_each(keys_sorted, kp) {
- cond_resched();
-
- struct journal_key *k = *kp;
-
- if (k->journal_seq)
- replay_now_at(j, k->journal_seq);
- else
- replay_now_at(j, j->replay_journal_seq_end);
-
- ret = commit_do(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_skip_accounting_apply|
- (!k->allocated
- ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
- : 0),
- bch2_journal_replay_key(trans, k));
- if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_btree_id_level_to_text(&buf, k->btree_id, k->level);
- bch_err_msg(c, ret, "while replaying key at %s:", buf.buf);
- printbuf_exit(&buf);
- goto err;
- }
-
- BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
- }
-
- /*
- * We need to put our btree_trans before calling flush_all_pins(), since
- * that will use a btree_trans internally
- */
- bch2_trans_put(trans);
- trans = NULL;
-
- if (!c->opts.retain_recovery_info &&
- c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay)
- bch2_journal_keys_put_initial(c);
-
- replay_now_at(j, j->replay_journal_seq_end);
- j->replay_journal_seq = 0;
-
- bch2_journal_set_replay_done(j);
-
- /* if we did any repair, flush it immediately */
- if (immediate_flush) {
- bch2_journal_flush_all_pins(&c->journal);
- ret = bch2_journal_meta(&c->journal);
- }
-
- if (keys->nr)
- bch2_journal_log_msg(c, "journal replay finished");
-err:
- if (trans)
- bch2_trans_put(trans);
- darray_exit(&keys_sorted);
- bch_err_fn(c, ret);
- return ret;
-}
-
-/* journal replay early: */
-
-static int journal_replay_entry_early(struct bch_fs *c,
- struct jset_entry *entry)
-{
- int ret = 0;
-
- switch (entry->type) {
- case BCH_JSET_ENTRY_btree_root: {
-
- if (unlikely(!entry->u64s))
- return 0;
-
- if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
- c, invalid_btree_id,
- "invalid btree id %u (max %u)",
- entry->btree_id, BTREE_ID_NR_MAX))
- return 0;
-
- while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
- ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
- if (ret)
- return ret;
- }
-
- struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
-
- r->level = entry->level;
- bkey_copy(&r->key, (struct bkey_i *) entry->start);
- r->error = 0;
- r->alive = true;
- break;
- }
- case BCH_JSET_ENTRY_usage: {
- struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
-
- switch (entry->btree_id) {
- case BCH_FS_USAGE_key_version:
- atomic64_set(&c->key_version, le64_to_cpu(u->v));
- break;
- }
- break;
- }
- case BCH_JSET_ENTRY_blacklist: {
- struct jset_entry_blacklist *bl_entry =
- container_of(entry, struct jset_entry_blacklist, entry);
-
- ret = bch2_journal_seq_blacklist_add(c,
- le64_to_cpu(bl_entry->seq),
- le64_to_cpu(bl_entry->seq) + 1);
- break;
- }
- case BCH_JSET_ENTRY_blacklist_v2: {
- struct jset_entry_blacklist_v2 *bl_entry =
- container_of(entry, struct jset_entry_blacklist_v2, entry);
-
- ret = bch2_journal_seq_blacklist_add(c,
- le64_to_cpu(bl_entry->start),
- le64_to_cpu(bl_entry->end) + 1);
- break;
- }
- case BCH_JSET_ENTRY_clock: {
- struct jset_entry_clock *clock =
- container_of(entry, struct jset_entry_clock, entry);
-
- atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
- }
- }
-fsck_err:
- return ret;
-}
-
-static int journal_replay_early(struct bch_fs *c,
- struct bch_sb_field_clean *clean)
-{
- if (clean) {
- for (struct jset_entry *entry = clean->start;
- entry != vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- int ret = journal_replay_entry_early(c, entry);
- if (ret)
- return ret;
- }
- } else {
- struct genradix_iter iter;
- struct journal_replay *i, **_i;
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- vstruct_for_each(&i->j, entry) {
- int ret = journal_replay_entry_early(c, entry);
- if (ret)
- return ret;
- }
- }
- }
-
- return 0;
-}
-
-/* sb clean section: */
-
-static int read_btree_roots(struct bch_fs *c)
-{
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (!r->alive)
- continue;
-
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, i, r->level);
-
- if (mustfix_fsck_err_on((ret = r->error),
- c, btree_root_bkey_invalid,
- "invalid btree root %s",
- buf.buf) ||
- mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
- c, btree_root_read_error,
- "error reading btree root %s: %s",
- buf.buf, bch2_err_str(ret))) {
- if (btree_id_is_alloc(i))
- r->error = 0;
- ret = 0;
- }
- }
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
-
- if (!r->b && !r->error) {
- r->alive = false;
- r->level = 0;
- bch2_btree_root_alloc_fake(c, i, 0);
- }
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static bool check_version_upgrade(struct bch_fs *c)
-{
- unsigned latest_version = bcachefs_metadata_version_current;
- unsigned latest_compatible = min(latest_version,
- bch2_latest_compatible_version(c->sb.version));
- unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
- unsigned new_version = 0;
- bool ret = false;
-
- if (old_version < bcachefs_metadata_required_upgrade_below) {
- if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
- latest_compatible < bcachefs_metadata_required_upgrade_below)
- new_version = latest_version;
- else
- new_version = latest_compatible;
- } else {
- switch (c->opts.version_upgrade) {
- case BCH_VERSION_UPGRADE_compatible:
- new_version = latest_compatible;
- break;
- case BCH_VERSION_UPGRADE_incompatible:
- new_version = latest_version;
- break;
- case BCH_VERSION_UPGRADE_none:
- new_version = min(old_version, latest_version);
- break;
- }
- }
-
- if (new_version > old_version) {
- struct printbuf buf = PRINTBUF;
-
- if (old_version < bcachefs_metadata_required_upgrade_below)
- prt_str(&buf, "Version upgrade required:\n");
-
- if (old_version != c->sb.version) {
- prt_str(&buf, "Version upgrade from ");
- bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
- prt_str(&buf, " to ");
- bch2_version_to_text(&buf, c->sb.version);
- prt_str(&buf, " incomplete\n");
- }
-
- prt_printf(&buf, "Doing %s version upgrade from ",
- BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
- ? "incompatible" : "compatible");
- bch2_version_to_text(&buf, old_version);
- prt_str(&buf, " to ");
- bch2_version_to_text(&buf, new_version);
- prt_newline(&buf);
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_upgrade(c, old_version, new_version);
- passes = ext->recovery_passes_required[0] & ~passes;
-
- if (passes) {
- prt_str(&buf, " running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
-
- bch_notice(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- ret = true;
- }
-
- if (new_version > c->sb.version_incompat_allowed &&
- c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Now allowing incompatible features up to ");
- bch2_version_to_text(&buf, new_version);
- prt_str(&buf, ", previously allowed up to ");
- bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
- prt_newline(&buf);
-
- bch_notice(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- ret = true;
- }
-
- if (ret)
- bch2_sb_upgrade(c, new_version,
- c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible);
-
- return ret;
-}
-
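-/*
- * Main recovery entry point: validates superblock feature bits, reads the
- * journal (or the clean section on a clean shutdown), applies early journal
- * entries, reads btree roots, and then runs the scheduled recovery passes.
- */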
-int bch2_fs_recovery(struct bch_fs *c)
-{
- struct bch_sb_field_clean *clean = NULL;
- struct jset *last_journal_entry = NULL;
- u64 last_seq = 0, blacklist_seq, journal_seq;
- int ret = 0;
-
- if (c->sb.clean) {
- clean = bch2_read_superblock_clean(c);
- ret = PTR_ERR_OR_ZERO(clean);
- if (ret)
- goto err;
-
- bch_info(c, "recovering from clean shutdown, journal seq %llu",
- le64_to_cpu(clean->journal_seq));
- } else {
- bch_info(c, "recovering from unclean shutdown");
- }
-
- if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
- bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
- ret = -EINVAL;
- goto err;
- }
-
- if (!c->sb.clean &&
- !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
- bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
- ret = -EINVAL;
- goto err;
- }
-
- if (c->opts.norecovery) {
- c->opts.recovery_pass_last = c->opts.recovery_pass_last
- ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
- : BCH_RECOVERY_PASS_snapshots_read;
- c->opts.nochanges = true;
- }
-
- if (c->opts.nochanges)
- c->opts.read_only = true;
-
- if (c->opts.journal_rewind) {
- bch_info(c, "rewinding journal, fsck required");
- c->opts.fsck = true;
- }
-
- if (go_rw_in_recovery(c)) {
- /*
- * start workqueues/kworkers early - kthread creation checks for
- * pending signals, which is _very_ annoying
- */
- ret = bch2_fs_init_rw(c);
- if (ret)
- goto err;
- }
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- bool write_sb = false;
-
- if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
- ext->recovery_passes_required[0] |=
- cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
- write_sb = true;
- }
-
- u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- if (sb_passes) {
- struct printbuf buf = PRINTBUF;
- prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
- prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (bch2_check_version_downgrade(c)) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Version downgrade required:");
-
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_downgrade(c,
- BCH_VERSION_MINOR(bcachefs_metadata_version_current),
- BCH_VERSION_MINOR(c->sb.version));
- passes = ext->recovery_passes_required[0] & ~passes;
- if (passes) {
- prt_str(&buf, "\n running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
-
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- write_sb = true;
- }
-
- if (check_version_upgrade(c))
- write_sb = true;
-
- c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
- if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) {
- SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
- write_sb = true;
- }
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (c->sb.clean)
- set_bit(BCH_FS_clean_recovery, &c->flags);
- if (c->opts.fsck)
- set_bit(BCH_FS_in_fsck, &c->flags);
- set_bit(BCH_FS_in_recovery, &c->flags);
-
- ret = bch2_blacklist_table_initialize(c);
- if (ret) {
- bch_err(c, "error initializing blacklist table");
- goto err;
- }
-
- bch2_journal_pos_from_member_info_resume(c);
-
- if (!c->sb.clean || c->opts.retain_recovery_info) {
- struct genradix_iter iter;
- struct journal_replay **i;
-
- bch_verbose(c, "starting journal read");
- ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
- if (ret)
- goto err;
-
- /*
- * note: cmd_list_journal needs the blacklist table fully up to date so
- * it can asterisk ignored journal entries:
- */
- if (c->opts.read_journal_only)
- goto out;
-
- genradix_for_each_reverse(&c->journal_entries, iter, i)
- if (!journal_replay_ignore(*i)) {
- last_journal_entry = &(*i)->j;
- break;
- }
-
- if (mustfix_fsck_err_on(c->sb.clean &&
- last_journal_entry &&
- !journal_entry_empty(last_journal_entry), c,
- clean_but_journal_not_empty,
- "filesystem marked clean but journal not empty")) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->sb.clean = false;
- }
-
- if (!last_journal_entry) {
- fsck_err_on(!c->sb.clean, c,
- dirty_but_no_journal_entries,
- "no journal entries found");
- if (clean)
- goto use_clean;
-
- genradix_for_each_reverse(&c->journal_entries, iter, i)
- if (*i) {
- last_journal_entry = &(*i)->j;
- (*i)->ignore_blacklisted = false;
-					(*i)->ignore_not_dirty = false;
- /*
- * This was probably a NO_FLUSH entry,
- * so last_seq was garbage - but we know
- * we're only using a single journal
- * entry, set it here:
- */
- (*i)->j.last_seq = (*i)->j.seq;
- break;
- }
- }
-
- ret = bch2_journal_keys_sort(c);
- if (ret)
- goto err;
-
- if (c->sb.clean && last_journal_entry) {
- ret = bch2_verify_superblock_clean(c, &clean,
- last_journal_entry);
- if (ret)
- goto err;
- }
- } else {
-use_clean:
- if (!clean) {
- bch_err(c, "no superblock clean section found");
- ret = bch_err_throw(c, fsck_repair_impossible);
-			goto err;
-		}
- blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
- }
-
- c->journal_replay_seq_start = last_seq;
- c->journal_replay_seq_end = blacklist_seq - 1;
-
- zero_out_btree_mem_ptr(&c->journal_keys);
-
- ret = journal_replay_early(c, clean);
- if (ret)
- goto err;
-
- ret = bch2_fs_resize_on_mount(c);
- if (ret) {
- up_write(&c->state_lock);
- goto err;
- }
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
- bch_info(c, "filesystem is an unresized image file, mounting ro");
- c->opts.read_only = true;
- }
-
- if (!c->opts.read_only &&
- (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) {
- bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
-
- bch2_reconstruct_alloc(c);
- } else if (c->opts.reconstruct_alloc) {
- bch2_journal_log_msg(c, "dropping alloc info");
- bch_info(c, "dropping and reconstructing all alloc info");
-
- bch2_reconstruct_alloc(c);
- }
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
- /* We can't go RW to fix errors without alloc info */
- if (c->opts.fix_errors == FSCK_FIX_yes ||
- c->opts.fix_errors == FSCK_FIX_ask)
- c->opts.fix_errors = FSCK_FIX_no;
- if (c->opts.errors == BCH_ON_ERROR_fix_safe)
- c->opts.errors = BCH_ON_ERROR_continue;
- }
-
- /*
-	 * After an unclean shutdown, skip the next few journal sequence
- * numbers as they may have been referenced by btree writes that
- * happened before their corresponding journal writes - those btree
- * writes need to be ignored, by skipping and blacklisting the next few
- * journal sequence numbers:
- */
- if (!c->sb.clean)
- journal_seq += JOURNAL_BUF_NR * 4;
-
- if (blacklist_seq != journal_seq) {
- ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
- blacklist_seq, journal_seq) ?:
- bch2_journal_seq_blacklist_add(c,
- blacklist_seq, journal_seq);
- if (ret) {
- bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
- goto err;
- }
- }
-
- ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
- journal_seq, last_seq, blacklist_seq - 1) ?:
- bch2_fs_journal_start(&c->journal, last_seq, journal_seq);
- if (ret)
- goto err;
-
- /*
- * Skip past versions that might have possibly been used (as nonces),
- * but hadn't had their pointers written:
- */
- if (c->sb.encryption_type && !c->sb.clean)
- atomic64_add(1 << 16, &c->key_version);
-
- ret = read_btree_roots(c);
- if (ret)
- goto err;
-
- set_bit(BCH_FS_btree_running, &c->flags);
-
- ret = bch2_sb_set_upgrade_extra(c);
- if (ret)
- goto err;
-
- ret = bch2_run_recovery_passes(c, 0);
- if (ret)
- goto err;
-
- /*
- * Normally set by the appropriate recovery pass: when cleared, this
- * indicates we're in early recovery and btree updates should be done by
-	 * being applied to the journal replay keys. _Must_ be set before
-	 * multithreaded use:
- */
- set_bit(BCH_FS_may_go_rw, &c->flags);
- clear_bit(BCH_FS_in_fsck, &c->flags);
-
- /* in case we don't run journal replay, i.e. norecovery mode */
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
-
- bch2_async_btree_node_rewrites_flush(c);
-
- /* fsync if we fixed errors */
- if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
- bch2_journal_flush_all_pins(&c->journal);
- bch2_journal_meta(&c->journal);
- }
-
- /* If we fixed errors, verify that fs is actually clean now: */
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- test_bit(BCH_FS_errors_fixed, &c->flags) &&
- !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
- !test_bit(BCH_FS_error, &c->flags)) {
- bch2_flush_fsck_errs(c);
-
- bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
- clear_bit(BCH_FS_errors_fixed, &c->flags);
-
- ret = bch2_run_recovery_passes(c,
- BCH_RECOVERY_PASS_check_alloc_info);
- if (ret)
- goto err;
-
- if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
- test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
- bch_err(c, "Second fsck run was not clean");
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- }
-
- set_bit(BCH_FS_errors_fixed, &c->flags);
- }
-
- if (enabled_qtypes(c)) {
- bch_verbose(c, "reading quotas");
- ret = bch2_fs_quota_read(c);
- if (ret)
- goto err;
- bch_verbose(c, "quotas done");
- }
-
- mutex_lock(&c->sb_lock);
- ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- write_sb = false;
-
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
- write_sb = true;
- }
-
- if (!test_bit(BCH_FS_error, &c->flags) &&
- !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
- write_sb = true;
- }
-
- if (!test_bit(BCH_FS_error, &c->flags) &&
- !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
- memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
- write_sb = true;
- }
-
- if (c->opts.fsck &&
- !test_bit(BCH_FS_error, &c->flags) &&
- c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 &&
- ext->btrees_lost_data) {
- ext->btrees_lost_data = 0;
- write_sb = true;
- }
-
- if (c->opts.fsck &&
- !test_bit(BCH_FS_error, &c->flags) &&
- !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
- SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
- SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
- write_sb = true;
- }
-
- if (bch2_blacklist_entries_gc(c))
- write_sb = true;
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
- c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
- struct bch_move_stats stats;
-
- bch2_move_stats_init(&stats, "recovery");
-
- struct printbuf buf = PRINTBUF;
- bch2_version_to_text(&buf, c->sb.version_min);
- bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
- printbuf_exit(&buf);
-
- ret = bch2_fs_read_write_early(c) ?:
- bch2_scan_old_btree_nodes(c, &stats);
- if (ret)
- goto err;
- bch_info(c, "scanning for old btree nodes done");
- }
-
- ret = 0;
-out:
- bch2_flush_fsck_errs(c);
-
- if (!ret &&
- test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
- !c->opts.nochanges) {
- bch2_fs_read_write_early(c);
- bch2_delete_dead_snapshots_async(c);
- }
-
- bch_err_fn(c, ret);
-final_out:
- if (!IS_ERR(clean))
- kfree(clean);
- return ret;
-err:
-fsck_err:
- {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret));
- bch2_fs_emergency_read_only2(c, &buf);
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
- goto final_out;
-}
-
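-/*
- * Create a fresh filesystem: allocate fake btree roots and the journal, mark
- * superblock and journal buckets, initialize freespace, subvolumes and
- * snapshots, then create the root directory and lost+found and write out the
- * first journal entry and superblock.
- */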
-int bch2_fs_initialize(struct bch_fs *c)
-{
- struct bch_inode_unpacked root_inode, lostfound_inode;
- struct bkey_inode_buf packed_inode;
- struct qstr lostfound = QSTR("lost+found");
- struct bch_member *m;
- int ret;
-
- bch_notice(c, "initializing new filesystem");
- set_bit(BCH_FS_new_fs, &c->flags);
-
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
-
- bch2_check_version_downgrade(c);
-
- if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
- bch2_sb_upgrade(c, bcachefs_metadata_version_current, false);
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- bch2_write_super(c);
- }
-
- for_each_member_device(c, ca) {
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
- ca->mi = bch2_mi_to_cpu(m);
- }
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- set_bit(BCH_FS_btree_running, &c->flags);
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++)
- bch2_btree_root_alloc_fake(c, i, 0);
-
- ret = bch2_fs_journal_alloc(c);
- if (ret)
- goto err;
-
- /*
- * journal_res_get() will crash if called before this has
- * set up the journal.pin FIFO and journal.cur pointer:
- */
- ret = bch2_fs_journal_start(&c->journal, 1, 1);
- if (ret)
- goto err;
-
- ret = bch2_fs_read_write_early(c);
- if (ret)
- goto err;
-
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
- bch2_journal_set_replay_done(&c->journal);
-
- for_each_member_device(c, ca) {
- ret = bch2_dev_usage_init(ca, false);
- if (ret) {
- bch2_dev_put(ca);
- goto err;
- }
- }
-
- /*
- * Write out the superblock and journal buckets, now that we can do
- * btree updates
- */
- bch_verbose(c, "marking superblocks");
- ret = bch2_trans_mark_dev_sbs(c);
- bch_err_msg(c, ret, "marking superblocks");
- if (ret)
- goto err;
-
- ret = bch2_fs_freespace_init(c);
- if (ret)
- goto err;
-
- ret = bch2_initialize_subvolumes(c);
- if (ret)
- goto err;
-
- bch_verbose(c, "reading snapshots table");
- ret = bch2_snapshots_read(c);
- if (ret)
- goto err;
- bch_verbose(c, "reading snapshots done");
-
- bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
- root_inode.bi_inum = BCACHEFS_ROOT_INO;
- root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
- bch2_inode_pack(&packed_inode, &root_inode);
- packed_inode.inode.k.p.snapshot = U32_MAX;
-
- ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "creating root directory");
- if (ret)
- goto err;
-
- bch2_inode_init_early(c, &lostfound_inode);
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_create_trans(trans,
- BCACHEFS_ROOT_SUBVOL_INUM,
- &root_inode, &lostfound_inode,
- &lostfound,
- 0, 0, S_IFDIR|0700, 0,
- NULL, NULL, (subvol_inum) { 0 }, 0));
- bch_err_msg(c, ret, "creating lost+found");
- if (ret)
- goto err;
-
- c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1;
-
- bch2_copygc_wakeup(c);
- bch2_rebalance_wakeup(c);
-
- if (enabled_qtypes(c)) {
- ret = bch2_fs_quota_read(c);
- if (ret)
- goto err;
- }
-
- ret = bch2_journal_flush(&c->journal);
- bch_err_msg(c, ret, "writing first journal entry");
- if (ret)
- goto err;
-
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- c->recovery.curr_pass = BCH_RECOVERY_PASS_NR;
- return 0;
-err:
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
deleted file mode 100644
index c023f52fc2d6..000000000000
--- a/fs/bcachefs/recovery.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_H
-#define _BCACHEFS_RECOVERY_H
-
-int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id);
-void bch2_reconstruct_alloc(struct bch_fs *);
-
-int bch2_journal_replay(struct bch_fs *);
-
-int bch2_fs_recovery(struct bch_fs *);
-int bch2_fs_initialize(struct bch_fs *);
-
-#endif /* _BCACHEFS_RECOVERY_H */
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
deleted file mode 100644
index 6a039e011064..000000000000
--- a/fs/bcachefs/recovery_passes.c
+++ /dev/null
@@ -1,646 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "backpointers.h"
-#include "btree_gc.h"
-#include "btree_node_scan.h"
-#include "disk_accounting.h"
-#include "ec.h"
-#include "fsck.h"
-#include "inode.h"
-#include "journal.h"
-#include "lru.h"
-#include "logged_ops.h"
-#include "movinggc.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-
-const char * const bch2_recovery_passes[] = {
-#define x(_fn, ...) #_fn,
- BCH_RECOVERY_PASSES()
-#undef x
- NULL
-};
-
-static const u8 passes_to_stable_map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-static const u8 passes_from_stable_map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
-{
- return passes_to_stable_map[pass];
-}
-
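-/*
- * Translate a bitmask of passes from in-memory (run order) numbering to the
- * stable numbering stored in the superblock:
- */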
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(passes_to_stable_map[i]);
- return ret;
-}
-
-static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass)
-{
- return pass < ARRAY_SIZE(passes_from_stable_map)
- ? passes_from_stable_map[pass]
- : 0;
-}
-
-u64 bch2_recovery_passes_from_stable(u64 v)
-{
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(passes_from_stable_map[i]);
- return ret;
-}
-
-static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- return 0;
-}
-
-static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_recovery_passes *r =
- field_to_type(f, recovery_passes);
- unsigned nr = recovery_passes_nr_entries(r);
-
- if (out->nr_tabstops < 1)
- printbuf_tabstop_push(out, 32);
- if (out->nr_tabstops < 2)
- printbuf_tabstop_push(out, 16);
-
- prt_printf(out, "Pass\tLast run\tLast runtime\n");
-
- for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) {
- if (!i->last_run)
- continue;
-
- unsigned idx = i - r->start;
-
- prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]);
-
- bch2_prt_datetime(out, le64_to_cpu(i->last_run));
- prt_tab(out);
-
- bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);
-
- if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
- prt_str(out, " (no ratelimit)");
-
- prt_newline(out);
- }
-}
-
-static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
-
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_recovery_passes *r =
- bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
-
- if (stable >= recovery_passes_nr_entries(r)) {
- unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64);
-
- r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
- if (!r) {
- bch_err(c, "error creating recovery_passes sb section");
- return NULL;
- }
- }
-
- return r->start + stable;
-}
-
-static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
- enum bch_recovery_pass pass,
- s64 start_time)
-{
- guard(mutex)(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __clear_bit_le64(bch2_recovery_pass_to_stable(pass),
- ext->recovery_passes_required);
-
- struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
- if (e) {
- s64 end_time = ktime_get_real_seconds();
- e->last_run = cpu_to_le64(end_time);
- e->last_runtime = cpu_to_le32(max(0, end_time - start_time));
- SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
- }
-
- bch2_write_super(c);
-}
-
-void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- guard(mutex)(&c->sb_lock);
-
- struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
- if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) {
-		SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true);
- bch2_write_super(c);
- }
-}
-
-static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
- bool ret = false;
-
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_recovery_passes *r =
- bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
-
- if (stable < recovery_passes_nr_entries(r)) {
- struct recovery_pass_entry *i = r->start + stable;
-
- /*
- * Ratelimit if the last runtime was more than 1% of the time
- * since we last ran
- */
- ret = (u64) le32_to_cpu(i->last_runtime) * 100 >
- ktime_get_real_seconds() - le64_to_cpu(i->last_run);
-
- if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
- ret = false;
- }
-
- return ret;
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = {
- .validate = bch2_sb_recovery_passes_validate,
- .to_text = bch2_sb_recovery_passes_to_text
-};
-
-/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
-static int bch2_recovery_pass_empty(struct bch_fs *c)
-{
- return 0;
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
-
- /*
- * After we go RW, the journal keys buffer can't be modified (except for
-	 * setting journal_key->overwritten): it will be accessed by multiple
-	 * threads
- */
- move_gap(keys, keys->nr);
-
- set_bit(BCH_FS_may_go_rw, &c->flags);
-
- if (go_rw_in_recovery(c)) {
- if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
- bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
- bch2_reconstruct_alloc(c);
- }
-
- return bch2_fs_read_write_early(c);
- }
- return 0;
-}
-
-/*
- * Make sure root inode is readable while we're still in recovery and can rewind
- * for repair:
- */
-static int bch2_lookup_root_inode(struct bch_fs *c)
-{
- subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM;
- struct bch_inode_unpacked inode_u;
- struct bch_subvolume subvol;
-
- return bch2_trans_do(c,
- bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
- bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
-}
-
-struct recovery_pass_fn {
- int (*fn)(struct bch_fs *);
- unsigned when;
-};
-
-static struct recovery_pass_fn recovery_pass_fns[] = {
-#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
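-/* Bitmask of all recovery passes whose 'when' flags match @flags: */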
-static u64 bch2_recovery_passes_match(unsigned flags)
-{
- u64 ret = 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
- if (recovery_pass_fns[i].when & flags)
- ret |= BIT_ULL(i);
- return ret;
-}
-
-u64 bch2_fsck_recovery_passes(void)
-{
- return bch2_recovery_passes_match(PASS_FSCK);
-}
-
-static void bch2_run_async_recovery_passes(struct bch_fs *c)
-{
- if (!down_trylock(&c->recovery.run_lock))
- return;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes))
- goto unlock;
-
- if (queue_work(system_long_wq, &c->recovery.work))
- return;
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
-unlock:
- up(&c->recovery.run_lock);
-}
-
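-/*
- * Check whether an explicit request to run @pass would actually change any
- * state, adjusting *flags as a side effect: scan_for_btree_nodes is never
- * made persistent, and the ratelimit flag is dropped when ratelimiting isn't
- * currently warranted for this pass.
- */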
-static bool recovery_pass_needs_set(struct bch_fs *c,
- enum bch_recovery_pass pass,
- enum bch_run_recovery_pass_flags *flags)
-{
- struct bch_fs_recovery *r = &c->recovery;
-
- /*
- * Never run scan_for_btree_nodes persistently: check_topology will run
- * it if required
- */
- if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
- *flags |= RUN_RECOVERY_PASS_nopersistent;
-
- if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
- !bch2_recovery_pass_want_ratelimit(c, pass))
- *flags &= ~RUN_RECOVERY_PASS_ratelimit;
-
- /*
- * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do
-	 * anything if the pass has already run: this means we need a prior pass
-	 * to run before we continue repairing; we don't expect that pass to fix
-	 * the damage we encountered.
- *
- * Otherwise, we run run_explicit_recovery_pass when we find damage, so
- * it should run again even if it's already run:
- */
- bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
- bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
- bool rewind = in_recovery &&
- r->curr_pass > pass &&
- !(r->passes_complete & BIT_ULL(pass));
-
- if (persistent
- ? !(c->sb.recovery_passes_required & BIT_ULL(pass))
- : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass)))
- return true;
-
- if (!(*flags & RUN_RECOVERY_PASS_ratelimit) &&
- (r->passes_ratelimiting & BIT_ULL(pass)))
- return true;
-
- if (rewind)
- return true;
-
- return false;
-}
-
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
- struct printbuf *out,
- enum bch_recovery_pass pass,
- enum bch_run_recovery_pass_flags flags)
-{
- struct bch_fs_recovery *r = &c->recovery;
- int ret = 0;
-
- lockdep_assert_held(&c->sb_lock);
-
- bch2_printbuf_make_room(out, 1024);
- out->atomic++;
-
- unsigned long lockflags;
- spin_lock_irqsave(&r->lock, lockflags);
-
- if (!recovery_pass_needs_set(c, pass, &flags))
- goto out;
-
- bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
- bool rewind = in_recovery &&
- r->curr_pass > pass &&
- !(r->passes_complete & BIT_ULL(pass));
- bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;
-
- if (!(flags & RUN_RECOVERY_PASS_nopersistent)) {
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
- }
-
- if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
- (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) {
- prt_printf(out, "need recovery pass %s (%u), but already rw\n",
- bch2_recovery_passes[pass], pass);
- ret = bch_err_throw(c, cannot_rewind_recovery);
- goto out;
- }
-
- if (ratelimit)
- r->passes_ratelimiting |= BIT_ULL(pass);
- else
- r->passes_ratelimiting &= ~BIT_ULL(pass);
-
- if (in_recovery && !ratelimit) {
- prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[r->curr_pass], r->curr_pass,
- rewind ? " - rewinding" : "");
-
- r->passes_to_run |= BIT_ULL(pass);
-
- if (rewind) {
- r->next_pass = pass;
-			r->passes_complete &= (1ULL << pass) - 1;
- ret = bch_err_throw(c, restart_recovery);
- }
- } else {
- prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
- bch2_recovery_passes[pass], pass,
- ratelimit ? " - ratelimiting" : "");
-
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
- if (p->when & PASS_ONLINE)
- bch2_run_async_recovery_passes(c);
- }
-out:
- spin_unlock_irqrestore(&r->lock, lockflags);
- --out->atomic;
- return ret;
-}
-
-int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- struct printbuf *out,
- enum bch_recovery_pass pass,
- enum bch_run_recovery_pass_flags flags)
-{
- int ret = 0;
-
- if (recovery_pass_needs_set(c, pass, &flags)) {
- guard(mutex)(&c->sb_lock);
- ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
- bch2_write_super(c);
- }
-
- return ret;
-}
-
-/*
- * Returns 0 if @pass has run recently, otherwise one of
- * -BCH_ERR_restart_recovery
- * -BCH_ERR_recovery_pass_will_run
- */
-int bch2_require_recovery_pass(struct bch_fs *c,
- struct printbuf *out,
- enum bch_recovery_pass pass)
-{
- if (test_bit(BCH_FS_in_recovery, &c->flags) &&
- c->recovery.passes_complete & BIT_ULL(pass))
- return 0;
-
- guard(mutex)(&c->sb_lock);
-
- if (bch2_recovery_pass_want_ratelimit(c, pass))
- return 0;
-
- enum bch_run_recovery_pass_flags flags = 0;
- int ret = 0;
-
- if (recovery_pass_needs_set(c, pass, &flags)) {
- ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
- bch2_write_super(c);
- }
-
- return ret ?: bch_err_throw(c, recovery_pass_will_run);
-}
-
-int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- enum bch_run_recovery_pass_flags flags = 0;
-
- if (!recovery_pass_needs_set(c, pass, &flags))
- return 0;
-
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- mutex_lock(&c->sb_lock);
- int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass,
- RUN_RECOVERY_PASS_nopersistent);
- mutex_unlock(&c->sb_lock);
-
- bch2_print_str(c, KERN_NOTICE, buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
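-/*
- * Run a single recovery pass: on failure the pass is marked as failing so it
- * isn't immediately retried; on success the failing mask is cleared and, if
- * no errors were found, the runtime is recorded in the superblock.
- */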
-static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct bch_fs_recovery *r = &c->recovery;
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
-
- s64 start_time = ktime_get_real_seconds();
- int ret = p->fn(c);
-
- r->passes_to_run &= ~BIT_ULL(pass);
-
- if (ret) {
- r->passes_failing |= BIT_ULL(pass);
- return ret;
- }
-
- r->passes_failing = 0;
-
- if (!test_bit(BCH_FS_error, &c->flags))
- bch2_sb_recovery_pass_complete(c, pass, start_time);
-
- if (!(p->when & PASS_SILENT))
- bch2_print(c, KERN_CONT " done\n");
-
- return 0;
-}
-
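-/*
- * Core pass-running loop: repeatedly pick the lowest pass in passes_to_run
- * and run it with the lock dropped; afterwards either mark it complete or,
- * if the pass requested a rewind (next_pass < curr_pass), re-queue
- * everything from the rewind point onwards.
- */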
-static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run,
- bool online)
-{
- struct bch_fs_recovery *r = &c->recovery;
- int ret = 0;
-
- spin_lock_irq(&r->lock);
-
- if (online)
- orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE);
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
- orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC);
-
- /*
- * A failed recovery pass will be retried after another pass succeeds -
- * but not this iteration.
- *
- * This is because some passes depend on repair done by other passes: we
- * may want to retry, but we don't want to loop on failing passes.
- */
-
- orig_passes_to_run &= ~r->passes_failing;
-
- r->passes_to_run = orig_passes_to_run;
-
- while (r->passes_to_run) {
- unsigned prev_done = r->pass_done;
- unsigned pass = __ffs64(r->passes_to_run);
- r->curr_pass = pass;
- r->next_pass = r->curr_pass + 1;
- r->passes_to_run &= ~BIT_ULL(pass);
-
- spin_unlock_irq(&r->lock);
-
- int ret2 = bch2_run_recovery_pass(c, pass) ?:
- bch2_journal_flush(&c->journal);
-
- spin_lock_irq(&r->lock);
-
- if (r->next_pass < r->curr_pass) {
- /* Rewind: */
- r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass);
- } else if (!ret2) {
- r->pass_done = max(r->pass_done, pass);
- r->passes_complete |= BIT_ULL(pass);
- } else {
- ret = ret2;
- }
-
- if (ret && !online)
- break;
-
- if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
- r->pass_done > BCH_RECOVERY_PASS_check_snapshots) {
- bch2_copygc_wakeup(c);
- bch2_rebalance_wakeup(c);
- }
- }
-
- clear_bit(BCH_FS_in_recovery, &c->flags);
- spin_unlock_irq(&r->lock);
-
- return ret;
-}
-
-static void bch2_async_recovery_passes_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, recovery.work);
- struct bch_fs_recovery *r = &c->recovery;
-
- __bch2_run_recovery_passes(c,
- c->sb.recovery_passes_required & ~r->passes_ratelimiting,
- true);
-
- up(&r->run_lock);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes)
-{
- return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true);
-}
-
-int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from)
-{
- u64 passes =
- bch2_recovery_passes_match(PASS_ALWAYS) |
- (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) |
- (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) |
- c->opts.recovery_passes |
- c->sb.recovery_passes_required;
-
- if (c->opts.recovery_pass_last)
- passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1;
-
- /*
- * We can't allow set_may_go_rw to be excluded; that would cause us to
- * use the journal replay keys for updates where it's not expected.
- */
- c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
- passes &= ~c->opts.recovery_passes_exclude;
-
- passes &= ~(BIT_ULL(from) - 1);
-
- down(&c->recovery.run_lock);
- int ret = __bch2_run_recovery_passes(c, passes, false);
- up(&c->recovery.run_lock);
-
- return ret;
-}
-
-static void prt_passes(struct printbuf *out, const char *msg, u64 passes)
-{
- prt_printf(out, "%s:\t", msg);
- prt_bitflags(out, bch2_recovery_passes, passes);
- prt_newline(out);
-}
-
-void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct bch_fs_recovery *r = &c->recovery;
-
- printbuf_tabstop_push(out, 32);
- prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required);
- prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required &
- bch2_recovery_passes_match(PASS_ONLINE));
- prt_passes(out, "Complete passes", r->passes_complete);
- prt_passes(out, "Failing passes", r->passes_failing);
-
- if (r->curr_pass) {
- prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]);
- prt_passes(out, "Current passes", r->passes_to_run);
- }
-}
-
-void bch2_fs_recovery_passes_init(struct bch_fs *c)
-{
- spin_lock_init(&c->recovery.lock);
- sema_init(&c->recovery.run_lock, 1);
-
- INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work);
-}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
deleted file mode 100644
index 2117f0ce1922..000000000000
--- a/fs/bcachefs/recovery_passes.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _BCACHEFS_RECOVERY_PASSES_H
-#define _BCACHEFS_RECOVERY_PASSES_H
-
-extern const char * const bch2_recovery_passes[];
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes;
-
-u64 bch2_recovery_passes_to_stable(u64 v);
-u64 bch2_recovery_passes_from_stable(u64 v);
-
-u64 bch2_fsck_recovery_passes(void);
-
-void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass);
-
-enum bch_run_recovery_pass_flags {
- RUN_RECOVERY_PASS_nopersistent = BIT(0),
- RUN_RECOVERY_PASS_ratelimit = BIT(1),
-};
-
-static inline bool go_rw_in_recovery(struct bch_fs *c)
-{
- return (c->journal_keys.nr ||
- !c->opts.read_only ||
- !c->sb.clean ||
- c->opts.recovery_passes ||
- (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))));
-}
-
-int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
-
-int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
- enum bch_recovery_pass,
- enum bch_run_recovery_pass_flags);
-int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
- enum bch_recovery_pass,
- enum bch_run_recovery_pass_flags);
-
-int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *,
- enum bch_recovery_pass);
-
-int bch2_run_online_recovery_passes(struct bch_fs *, u64);
-int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass);
-
-void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *);
-
-void bch2_fs_recovery_passes_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
deleted file mode 100644
index b63c20558d3d..000000000000
--- a/fs/bcachefs/recovery_passes_format.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H
-#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H
-
-#define PASS_SILENT BIT(0)
-#define PASS_FSCK BIT(1)
-#define PASS_UNCLEAN BIT(2)
-#define PASS_ALWAYS BIT(3)
-#define PASS_ONLINE BIT(4)
-#define PASS_ALLOC BIT(5)
-#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define PASS_FSCK_DEBUG BIT(1)
-#else
-#define PASS_FSCK_DEBUG 0
-#endif
-
-/*
- * Passes may be reordered, but the second field is a persistent identifier and
- * must never change:
- */
-#define BCH_RECOVERY_PASSES() \
- x(recovery_pass_empty, 41, PASS_SILENT) \
- x(scan_for_btree_nodes, 37, 0) \
- x(check_topology, 4, 0) \
- x(accounting_read, 39, PASS_ALWAYS) \
- x(alloc_read, 0, PASS_ALWAYS) \
- x(stripes_read, 1, 0) \
- x(initialize_subvolumes, 2, 0) \
- x(snapshots_read, 3, PASS_ALWAYS) \
- x(check_allocations, 5, PASS_FSCK_ALLOC) \
- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
- x(journal_replay, 9, PASS_ALWAYS) \
- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
- x(bucket_gens_init, 17, 0) \
- x(reconstruct_snapshots, 38, 0) \
- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
- x(fs_upgrade_for_subvolumes, 22, 0) \
- x(check_inodes, 24, PASS_FSCK) \
- x(check_extents, 25, PASS_FSCK) \
- x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
- x(check_dirents, 27, PASS_FSCK) \
- x(check_xattrs, 28, PASS_FSCK) \
- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
- x(check_unreachable_inodes, 40, PASS_FSCK) \
- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
- x(check_nlinks, 31, PASS_FSCK) \
- x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
- x(delete_dead_inodes, 32, PASS_ALWAYS) \
- x(fix_reflink_p, 33, 0) \
- x(set_fs_needs_rebalance, 34, 0) \
- x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT)
-
-/* We normally enumerate recovery passes in the order we run them: */
-enum bch_recovery_pass {
-#define x(n, id, when) BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- BCH_RECOVERY_PASS_NR
-};
-
-/* But we also need stable identifiers that can be used in the superblock */
-enum bch_recovery_pass_stable {
-#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
-struct recovery_pass_entry {
- __le64 last_run;
- __le32 last_runtime;
- __le32 flags;
-};
-
-LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1)
-
-struct bch_sb_field_recovery_passes {
- struct bch_sb_field field;
- struct recovery_pass_entry start[];
-};
-
-static inline unsigned
-recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r)
-{
- return r
- ? ((vstruct_end(&r->field) - (void *) &r->start[0]) /
- sizeof(struct recovery_pass_entry))
- : 0;
-}
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
deleted file mode 100644
index aa9526938cc3..000000000000
--- a/fs/bcachefs/recovery_passes_types.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
-#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
-
-struct bch_fs_recovery {
- /*
- * Two different uses:
-	 * "Has this fsck pass run?" - i.e. should this type of error cause an
-	 * emergency read-only
- * And, in certain situations fsck will rewind to an earlier pass: used
- * for signaling to the toplevel code which pass we want to run now.
- */
- enum bch_recovery_pass curr_pass;
- enum bch_recovery_pass next_pass;
- /* never rewinds version of curr_pass */
- enum bch_recovery_pass pass_done;
- u64 passes_to_run;
- /* bitmask of recovery passes that we actually ran */
- u64 passes_complete;
- u64 passes_failing;
- u64 passes_ratelimiting;
- spinlock_t lock;
- struct semaphore run_lock;
- struct work_struct work;
-};
-
-#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
deleted file mode 100644
index 92b90cfe622b..000000000000
--- a/fs/bcachefs/reflink.c
+++ /dev/null
@@ -1,865 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "bkey_buf.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "io_misc.h"
-#include "io_write.h"
-#include "rebalance.h"
-#include "reflink.h"
-#include "subvolume.h"
-#include "super-io.h"
-
-#include <linux/sched/signal.h>
-
-static inline bool bkey_extent_is_reflink_data(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_reflink_v:
- case KEY_TYPE_indirect_inline_data:
- return true;
- default:
- return false;
- }
-}
-
-static inline unsigned bkey_type_to_indirect(const struct bkey *k)
-{
- switch (k->type) {
- case KEY_TYPE_extent:
- return KEY_TYPE_reflink_v;
- case KEY_TYPE_inline_data:
- return KEY_TYPE_indirect_inline_data;
- default:
- return 0;
- }
-}
-
-/* reflink pointers */
-
-int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- int ret = 0;
-
- bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad),
- c, reflink_p_front_pad_bad,
- "idx < front_pad (%llu < %u)",
- REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad));
-fsck_err:
- return ret;
-}
-
-void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-
- prt_printf(out, "idx %llu front_pad %u back_pad %u",
- REFLINK_P_IDX(p.v),
- le32_to_cpu(p.v->front_pad),
- le32_to_cpu(p.v->back_pad));
-
- if (REFLINK_P_ERROR(p.v))
- prt_str(out, " error");
-}
-
-bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
- struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r);
-
- /*
- * Disabled for now, the triggers code needs to be reworked for merging
- * of reflink pointers to work:
- */
- return false;
-
- if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v))
- return false;
-
- if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v))
- return false;
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
- return true;
-}
-
-/* indirect extents */
-
-int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)),
- c, reflink_v_pos_bad,
- "indirect extent above maximum position 0:%llu",
- REFLINK_P_IDX_MAX);
-
- ret = bch2_bkey_ptrs_validate(c, k, from);
-fsck_err:
- return ret;
-}
-
-void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
- prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
-
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-#if 0
-Currently disabled, needs to be debugged:
-
-bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
-{
- struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
-
- return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
-}
-#endif
-
-/* indirect inline data */
-
-int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- return 0;
-}
-
-void bch2_indirect_inline_data_to_text(struct printbuf *out,
- struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
- unsigned datalen = bkey_inline_data_bytes(k.k);
-
- prt_printf(out, "refcount %llu datalen %u: %*phN",
- le64_to_cpu(d.v->refcount), datalen,
- min(datalen, 32U), d.v->data);
-}
-
-/* lookup */
-
-static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p,
- bool should_commit)
-{
- struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- SET_REFLINK_P_ERROR(&new->v, false);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
- if (ret)
- return ret;
-
- if (!should_commit)
- return 0;
-
- return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
-}
-
-static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 missing_start, u64 missing_end,
- bool should_commit)
-{
- if (REFLINK_P_ERROR(p.v))
- return 0;
-
- struct bch_fs *c = trans->c;
- u64 live_start = REFLINK_P_IDX(p.v);
- u64 live_end = REFLINK_P_IDX(p.v) + p.k->size;
- u64 refd_start = live_start - le32_to_cpu(p.v->front_pad);
- u64 refd_end = live_end + le32_to_cpu(p.v->back_pad);
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(missing_start < refd_start);
- BUG_ON(missing_end > refd_end);
-
- struct bpos missing_pos = bkey_start_pos(p.k);
- missing_pos.offset += missing_start - live_start;
-
- prt_printf(&buf, "pointer to missing indirect extent in ");
- ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos);
- if (ret)
- goto err;
-
- prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9);
- bch2_bkey_val_to_text(&buf, c, p.s_c);
-
- prt_printf(&buf, "\nmissing reflink btree range %llu-%llu",
- missing_start, missing_end);
-
- if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) {
- struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- /*
- * Is the missing range not actually needed?
- *
- * p.v->idx refers to the data that we actually want, but if the
- * indirect extent we point to was bigger, front_pad and back_pad
- * indicate the range we took a reference on.
- */
-
- if (missing_end <= live_start) {
- new->v.front_pad = cpu_to_le32(live_start - missing_end);
- } else if (missing_start >= live_end) {
- new->v.back_pad = cpu_to_le32(missing_start - live_end);
- } else {
- struct bpos new_start = bkey_start_pos(&new->k);
- struct bpos new_end = new->k.p;
-
- if (missing_start > live_start)
- new_start.offset += missing_start - live_start;
- if (missing_end < live_end)
- new_end.offset -= live_end - missing_end;
-
- bch2_cut_front(new_start, &new->k_i);
- bch2_cut_back(new_end, &new->k_i);
-
- SET_REFLINK_P_ERROR(&new->v, true);
- }
-
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun);
- if (ret)
- goto err;
-
- if (should_commit)
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * This is used from the read path, which doesn't expect to have to do a
- * transaction commit, and from triggers, which should not be doing a commit:
- */
-struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- s64 *offset_into_extent,
- struct bkey_s_c_reflink_p p,
- bool should_commit,
- unsigned iter_flags)
-{
- BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad)));
- BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad));
-
- u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent;
-
- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink,
- POS(0, reflink_offset), iter_flags);
- if (bkey_err(k))
- return k;
-
- if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
- u64 missing_end = min(k.k->p.offset,
- REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad));
- BUG_ON(reflink_offset == missing_end);
-
- int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
- missing_end, should_commit);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return bkey_s_c_err(ret);
- }
- } else if (unlikely(REFLINK_P_ERROR(p.v))) {
- int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return bkey_s_c_err(ret);
- }
- }
-
- *offset_into_extent = reflink_offset - bkey_start_offset(k.k);
- return k;
-}
-
-/* reflink pointer trigger */
-
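-/*
- * Transactional trigger for one segment of a reflink pointer: look up the
- * indirect extent it references and adjust that extent's refcount by +1 or
- * -1; on insert, front_pad/back_pad are also grown to record the full range
- * the pointer now holds a reference on.
- */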
-static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p, u64 *idx,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v);
- struct btree_iter iter;
- struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false,
- BTREE_ITER_intent|
- BTREE_ITER_with_updates);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_refcount_c(k)) {
- if (!(flags & BTREE_TRIGGER_overwrite))
- ret = bch_err_throw(c, missing_indirect_extent);
- goto next;
- }
-
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- __le64 *refcount = bkey_refcount(bkey_i_to_s(new));
- if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- log_fsck_err(trans, reflink_refcount_underflow,
- "indirect extent refcount underflow while marking\n%s",
- buf.buf);
- goto next;
- }
-
- if (flags & BTREE_TRIGGER_insert) {
- struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
- u64 pad;
-
- pad = max_t(s64, le32_to_cpu(v->front_pad),
- REFLINK_P_IDX(v) - bkey_start_offset(&new->k));
- BUG_ON(pad > U32_MAX);
- v->front_pad = cpu_to_le32(pad);
-
- pad = max_t(s64, le32_to_cpu(v->back_pad),
- new->k.p.offset - p.k->size - REFLINK_P_IDX(v));
- BUG_ON(pad > U32_MAX);
- v->back_pad = cpu_to_le32(pad);
- }
-
- le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1);
-
- bch2_btree_iter_set_pos_to_extent_start(&iter);
- ret = bch2_trans_update(trans, &iter, new, 0);
- if (ret)
- goto err;
-next:
- *idx = k.k->p.offset;
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p, u64 *idx,
- enum btree_iter_update_trigger_flags flags,
- size_t r_idx)
-{
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1;
- u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
- s64 ret = 0;
- struct printbuf buf = PRINTBUF;
-
- if (r_idx >= c->reflink_gc_nr)
- goto not_found;
-
- r = genradix_ptr(&c->reflink_gc_table, r_idx);
- next_idx = min(next_idx, r->offset - r->size);
- if (*idx < next_idx)
- goto not_found;
-
- BUG_ON((s64) r->refcount + add < 0);
-
- if (flags & BTREE_TRIGGER_gc)
- r->refcount += add;
- *idx = r->offset;
- return 0;
-not_found:
- if (flags & BTREE_TRIGGER_check_repair) {
- ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
- if (ret)
- goto err;
- }
-
- *idx = next_idx;
-err:
- printbuf_exit(&buf);
- return ret;
-}
-
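-/*
- * Walk every indirect extent referenced by a reflink pointer, including the
- * front/back padding: the transactional path updates refcounts in the
- * reflink btree, while the gc/check path walks the in-memory
- * reflink_gc_table (found by binary search) instead.
- */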
-static int __trigger_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- int ret = 0;
-
- u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
- u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
-
- if (flags & BTREE_TRIGGER_transactional) {
- while (idx < end && !ret)
- ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
- }
-
- if (flags & (BTREE_TRIGGER_check_repair|BTREE_TRIGGER_gc)) {
- size_t l = 0, r = c->reflink_gc_nr;
-
- while (l < r) {
- size_t m = l + (r - l) / 2;
- struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m);
- if (ref->offset <= idx)
- l = m + 1;
- else
- r = m;
- }
-
- while (idx < end && !ret)
- ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++);
- }
-
- return ret;
-}
-
-int bch2_trigger_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- if ((flags & BTREE_TRIGGER_transactional) &&
- (flags & BTREE_TRIGGER_insert)) {
- struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
-
- v->front_pad = v->back_pad = 0;
- }
-
- return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags);
-}
-
-/* indirect extent trigger */
-
-static inline void
-check_indirect_extent_deleting(struct bkey_s new,
- enum btree_iter_update_trigger_flags *flags)
-{
- if ((*flags & BTREE_TRIGGER_insert) && !*bkey_refcount(new)) {
- new.k->type = KEY_TYPE_deleted;
- new.k->size = 0;
- set_bkey_val_u64s(new.k, 0);
- *flags &= ~BTREE_TRIGGER_insert;
- }
-}
-
-int bch2_trigger_reflink_v(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- if ((flags & BTREE_TRIGGER_transactional) &&
- (flags & BTREE_TRIGGER_insert))
- check_indirect_extent_deleting(new, &flags);
-
- return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
-}
-
-int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- check_indirect_extent_deleting(new, &flags);
-
- return 0;
-}
-
-/* create */
-
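-/*
- * Convert an ordinary extent to an indirect one: copy its value into a new
- * reflink_v (or indirect_inline_data) key in the reflink btree, then rewrite
- * the original key in place as a reflink_p pointing at it.
- */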
-static int bch2_make_extent_indirect(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct bkey_i *orig,
- bool reflink_p_may_update_opts_field)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter reflink_iter = {};
- struct bkey_s_c k;
- struct bkey_i *r_v;
- struct bkey_i_reflink_p *r_p;
- __le64 *refcount;
- int ret;
-
- if (orig->k.type == KEY_TYPE_inline_data)
- bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
-
- bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_prev(trans, &reflink_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- /*
- * XXX: we're assuming that 56 bits will be enough for the life of the
- * filesystem: we need to implement wraparound, with a cursor in the
- * logged ops btree:
- */
- if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size)))
- return -ENOSPC;
-
- r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
- ret = PTR_ERR_OR_ZERO(r_v);
- if (ret)
- goto err;
-
- bkey_init(&r_v->k);
- r_v->k.type = bkey_type_to_indirect(&orig->k);
- r_v->k.p = reflink_iter.pos;
- bch2_key_resize(&r_v->k, orig->k.size);
- r_v->k.bversion = orig->k.bversion;
-
- set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
-
- refcount = bkey_refcount(bkey_i_to_s(r_v));
- *refcount = 0;
- memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
-
- ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
- if (ret)
- goto err;
-
- /*
-	 * orig is in a bkey_buf, which statically allocates five u64s for the
-	 * val, so we know it will be big enough:
- */
- orig->k.type = KEY_TYPE_reflink_p;
- r_p = bkey_i_to_reflink_p(orig);
- set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
-
- /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */
-#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
- __underlying_memset(&r_p->v, 0, sizeof(r_p->v));
-#else
- memset(&r_p->v, 0, sizeof(r_p->v));
-#endif
-
- SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k));
-
- if (reflink_p_may_update_opts_field)
- SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true);
-
- ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
- BTREE_UPDATE_internal_snapshot_node);
-err:
- bch2_trans_iter_exit(trans, &reflink_iter);
-
- return ret;
-}
-
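-/*
- * Find the next source extent that contains data, skipping unwritten extents;
- * returns a null key at the end of the range, or an error key on failure.
- */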
-static struct bkey_s_c get_next_src(struct btree_trans *trans,
- struct btree_iter *iter, struct bpos end)
-{
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) {
- if (bkey_extent_is_unwritten(k))
- continue;
-
- if (bkey_extent_is_data(k.k))
- return k;
- }
-
- if (bkey_ge(iter->pos, end))
- bch2_btree_iter_set_pos(trans, iter, end);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-}
-
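-/*
- * Reflink a range of extents from the source inode to the destination: source
- * extents are made indirect as needed, reflink pointers are inserted into the
- * destination (punching holes where the source has none), and the destination
- * i_size is updated on completion.
- */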
-s64 bch2_remap_range(struct bch_fs *c,
- subvol_inum dst_inum, u64 dst_offset,
- subvol_inum src_inum, u64 src_offset,
- u64 remap_sectors,
- u64 new_i_size, s64 *i_sectors_delta,
- bool may_change_src_io_path_opts)
-{
- struct btree_trans *trans;
- struct btree_iter dst_iter, src_iter;
- struct bkey_s_c src_k;
- struct bkey_buf new_dst, new_src;
- struct bpos dst_start = POS(dst_inum.inum, dst_offset);
- struct bpos src_start = POS(src_inum.inum, src_offset);
- struct bpos dst_end = dst_start, src_end = src_start;
- struct bch_io_opts opts;
- struct bpos src_want;
- u64 dst_done = 0;
- u32 dst_snapshot, src_snapshot;
- bool reflink_p_may_update_opts_field =
- !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
- int ret = 0, ret2 = 0;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink))
- return bch_err_throw(c, erofs_no_writes);
-
- bch2_check_set_feature(c, BCH_FEATURE_reflink);
-
- dst_end.offset += remap_sectors;
- src_end.offset += remap_sectors;
-
- bch2_bkey_buf_init(&new_dst);
- bch2_bkey_buf_init(&new_src);
- trans = bch2_trans_get(c);
-
- ret = bch2_inum_opts_get(trans, src_inum, &opts);
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
- BTREE_ITER_intent);
- bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
- BTREE_ITER_intent);
-
- while ((ret == 0 ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
- bkey_lt(dst_iter.pos, dst_end)) {
- struct disk_reservation disk_res = { 0 };
-
- bch2_trans_begin(trans);
-
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
- &src_snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot);
-
- ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
- &dst_snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot);
-
- if (dst_inum.inum < src_inum.inum) {
- /* Avoid some lock cycle transaction restarts */
- ret = bch2_btree_iter_traverse(trans, &dst_iter);
- if (ret)
- continue;
- }
-
- dst_done = dst_iter.pos.offset - dst_start.offset;
- src_want = POS(src_start.inode, src_start.offset + dst_done);
- bch2_btree_iter_set_pos(trans, &src_iter, src_want);
-
- src_k = get_next_src(trans, &src_iter, src_end);
- ret = bkey_err(src_k);
- if (ret)
- continue;
-
- if (bkey_lt(src_want, src_iter.pos)) {
- ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
- min(dst_end.offset,
- dst_iter.pos.offset +
- src_iter.pos.offset - src_want.offset),
- i_sectors_delta);
- continue;
- }
-
- if (src_k.k->type != KEY_TYPE_reflink_p) {
- bch2_btree_iter_set_pos_to_extent_start(&src_iter);
-
- bch2_bkey_buf_reassemble(&new_src, c, src_k);
- src_k = bkey_i_to_s_c(new_src.k);
-
- ret = bch2_make_extent_indirect(trans, &src_iter,
- new_src.k,
- reflink_p_may_update_opts_field);
- if (ret)
- continue;
-
- BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
- }
-
- if (src_k.k->type == KEY_TYPE_reflink_p) {
- struct bkey_s_c_reflink_p src_p =
- bkey_s_c_to_reflink_p(src_k);
- struct bkey_i_reflink_p *dst_p =
- bkey_reflink_p_init(new_dst.k);
-
- u64 offset = REFLINK_P_IDX(src_p.v) +
- (src_want.offset -
- bkey_start_offset(src_k.k));
-
- SET_REFLINK_P_IDX(&dst_p->v, offset);
-
- if (reflink_p_may_update_opts_field &&
- may_change_src_io_path_opts &&
- REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v))
- SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
- } else {
- BUG();
- }
-
- new_dst.k->k.p = dst_iter.pos;
- bch2_key_resize(&new_dst.k->k,
- min(src_k.k->p.offset - src_want.offset,
- dst_end.offset - dst_iter.pos.offset));
-
- ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?:
- bch2_extent_update(trans, dst_inum, &dst_iter,
- new_dst.k, &disk_res,
- new_i_size, i_sectors_delta,
- true);
- bch2_disk_reservation_put(c, &disk_res);
- }
- bch2_trans_iter_exit(trans, &dst_iter);
- bch2_trans_iter_exit(trans, &src_iter);
-
- BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
- BUG_ON(bkey_gt(dst_iter.pos, dst_end));
-
- dst_done = dst_iter.pos.offset - dst_start.offset;
- new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
-
- do {
- struct bch_inode_unpacked inode_u;
- struct btree_iter inode_iter = {};
-
- bch2_trans_begin(trans);
-
- ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
- dst_inum, BTREE_ITER_intent);
-
- if (!ret2 &&
- inode_u.bi_size < new_i_size) {
- inode_u.bi_size = new_i_size;
- ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc);
- }
-
- bch2_trans_iter_exit(trans, &inode_iter);
- } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-err:
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&new_src, c);
- bch2_bkey_buf_exit(&new_dst, c);
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink);
-
- return dst_done ?: ret ?: ret2;
-}
-
-/* fsck */
-
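-/*
- * Compare an indirect extent's refcount against the count computed by gc, and
- * repair the key (deleting it if the correct count is zero) on mismatch.
- */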
-static int bch2_gc_write_reflink_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- size_t *idx)
-{
- struct bch_fs *c = trans->c;
- const __le64 *refcount = bkey_refcount_c(k);
- struct printbuf buf = PRINTBUF;
- struct reflink_gc *r;
- int ret = 0;
-
- if (!refcount)
- return 0;
-
- while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
- r->offset < k.k->p.offset)
- ++*idx;
-
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- return -EINVAL;
- }
-
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
- trans, reflink_v_refcount_wrong,
- "reflink key has wrong refcount:\n"
- "%s\n"
- "should be %u",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
- r->refcount)) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- if (!r->refcount)
- new->k.type = KEY_TYPE_deleted;
- else
- *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
- ret = bch2_trans_update(trans, iter, new, 0);
- }
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_gc_reflink_done(struct bch_fs *c)
-{
- size_t idx = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
- c->reflink_gc_nr = 0;
- return ret;
-}
-
-int bch2_gc_reflink_start(struct bch_fs *c)
-{
- c->reflink_gc_nr = 0;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- continue;
-
- struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
- c->reflink_gc_nr++, GFP_KERNEL);
- if (!r) {
- ret = bch_err_throw(c, ENOMEM_gc_reflink_start);
- break;
- }
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- 0;
- })));
-
- bch_err_fn(c, ret);
- return ret;
-}
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
deleted file mode 100644
index 1632780bdf18..000000000000
--- a/fs/bcachefs/reflink.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REFLINK_H
-#define _BCACHEFS_REFLINK_H
-
-int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
- .key_validate = bch2_reflink_p_validate, \
- .val_to_text = bch2_reflink_p_to_text, \
- .key_merge = bch2_reflink_p_merge, \
- .trigger = bch2_trigger_reflink_p, \
- .min_val_size = 16, \
-})
-
-int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
- .key_validate = bch2_reflink_v_validate, \
- .val_to_text = bch2_reflink_v_to_text, \
- .swab = bch2_ptr_swab, \
- .trigger = bch2_trigger_reflink_v, \
- .min_val_size = 8, \
-})
-
-int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_indirect_inline_data_to_text(struct printbuf *,
- struct bch_fs *, struct bkey_s_c);
-int bch2_trigger_indirect_inline_data(struct btree_trans *,
- enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
- .key_validate = bch2_indirect_inline_data_validate, \
- .val_to_text = bch2_indirect_inline_data_to_text, \
- .trigger = bch2_trigger_indirect_inline_data, \
- .min_val_size = 8, \
-})
-
-static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_reflink_v:
- return &bkey_s_c_to_reflink_v(k).v->refcount;
- case KEY_TYPE_indirect_inline_data:
- return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
- default:
- return NULL;
- }
-}
-
-static inline __le64 *bkey_refcount(struct bkey_s k)
-{
- switch (k.k->type) {
- case KEY_TYPE_reflink_v:
- return &bkey_s_to_reflink_v(k).v->refcount;
- case KEY_TYPE_indirect_inline_data:
- return &bkey_s_to_indirect_inline_data(k).v->refcount;
- default:
- return NULL;
- }
-}
-
-struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *,
- s64 *, struct bkey_s_c_reflink_p,
- bool, unsigned);
-
-s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
- subvol_inum, u64, u64, u64, s64 *,
- bool);
-
-int bch2_gc_reflink_done(struct bch_fs *);
-int bch2_gc_reflink_start(struct bch_fs *);
-
-#endif /* _BCACHEFS_REFLINK_H */
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
deleted file mode 100644
index 92995e4f898e..000000000000
--- a/fs/bcachefs/reflink_format.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REFLINK_FORMAT_H
-#define _BCACHEFS_REFLINK_FORMAT_H
-
-struct bch_reflink_p {
- struct bch_val v;
- __le64 idx_flags;
- /*
- * A reflink pointer might point to an indirect extent which is then
- * later split (by copygc or rebalance). If we only pointed to part of
- * the original indirect extent, and then one of the fragments is
- * outside the range we point to, we'd leak a refcount: so when creating
- * reflink pointers, we need to store pad values to remember the full
- * range we were taking a reference on.
- */
- __le32 front_pad;
- __le32 back_pad;
-} __packed __aligned(8);
-
-LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56);
-LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57);
-LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS,
- struct bch_reflink_p, idx_flags, 57, 58);
-
-struct bch_reflink_v {
- struct bch_val v;
- __le64 refcount;
- union bch_extent_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-struct bch_indirect_inline_data {
- struct bch_val v;
- __le64 refcount;
- u8 data[];
-};
-
-#endif /* _BCACHEFS_REFLINK_FORMAT_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
deleted file mode 100644
index 8383bd7fdb3f..000000000000
--- a/fs/bcachefs/replicas.c
+++ /dev/null
@@ -1,918 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "buckets.h"
-#include "disk_accounting.h"
-#include "journal.h"
-#include "replicas.h"
-#include "super-io.h"
-
-#include <linux/sort.h>
-
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
- struct bch_replicas_cpu *);
-
-/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, const void *priv)
-{
- size_t size = (size_t) priv;
- return memcmp(l, r, size);
-}
-
-/* Replicas tracking - in memory: */
-
-static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- BUG_ON(!e->nr_devs);
- BUG_ON(e->nr_required > 1 &&
- e->nr_required >= e->nr_devs);
-
- for (unsigned i = 0; i + 1 < e->nr_devs; i++)
- BUG_ON(e->devs[i] >= e->devs[i + 1]);
-#endif
-}
-
-void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
-{
- bubble_sort(e->devs, e->nr_devs, u8_cmp);
-}
-
-static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
-{
- eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
- bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
-}
-
-static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
- struct bch_replicas_entry_v0 *e)
-{
- bch2_prt_data_type(out, e->data_type);
-
- prt_printf(out, ": %u [", e->nr_devs);
- for (unsigned i = 0; i < e->nr_devs; i++)
- prt_printf(out, i ? " %u" : "%u", e->devs[i]);
- prt_printf(out, "]");
-}
-
-void bch2_replicas_entry_to_text(struct printbuf *out,
- struct bch_replicas_entry_v1 *e)
-{
- bch2_prt_data_type(out, e->data_type);
-
- prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
- for (unsigned i = 0; i < e->nr_devs; i++)
- prt_printf(out, i ? " %u" : "%u", e->devs[i]);
- prt_printf(out, "]");
-}
-
-static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
- struct bch_sb *sb,
- struct printbuf *err)
-{
- if (!r->nr_devs) {
- prt_printf(err, "no devices in entry ");
- goto bad;
- }
-
- if (r->nr_required > 1 &&
- r->nr_required >= r->nr_devs) {
- prt_printf(err, "bad nr_required in entry ");
- goto bad;
- }
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
- !bch2_member_exists(sb, r->devs[i])) {
- prt_printf(err, "invalid device %u in entry ", r->devs[i]);
- goto bad;
- }
-
- return 0;
-bad:
- bch2_replicas_entry_to_text(err, r);
- return -BCH_ERR_invalid_replicas_entry;
-}
-
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
- struct bch_fs *c,
- struct printbuf *err)
-{
- if (!r->nr_devs) {
- prt_printf(err, "no devices in entry ");
- goto bad;
- }
-
- if (r->nr_required > 1 &&
- r->nr_required >= r->nr_devs) {
- prt_printf(err, "bad nr_required in entry ");
- goto bad;
- }
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
- !bch2_dev_exists(c, r->devs[i])) {
- prt_printf(err, "invalid device %u in entry ", r->devs[i]);
- goto bad;
- }
-
- return 0;
-bad:
- bch2_replicas_entry_to_text(err, r);
- return bch_err_throw(c, invalid_replicas_entry);
-}
-
-void bch2_cpu_replicas_to_text(struct printbuf *out,
- struct bch_replicas_cpu *r)
-{
- struct bch_replicas_entry_v1 *e;
- bool first = true;
-
- for_each_cpu_replicas_entry(r, e) {
- if (!first)
- prt_printf(out, " ");
- first = false;
-
- bch2_replicas_entry_to_text(out, e);
- }
-}
-
-static void extent_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry_v1 *r)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- r->nr_required = 1;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.cached)
- continue;
-
- if (!p.has_ec)
- replicas_entry_add_dev(r, p.ptr.dev);
- else
- r->nr_required = 0;
- }
-}
-
-static void stripe_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry_v1 *r)
-{
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- const struct bch_extent_ptr *ptr;
-
- r->nr_required = s.v->nr_blocks - s.v->nr_redundant;
-
- for (ptr = s.v->ptrs;
- ptr < s.v->ptrs + s.v->nr_blocks;
- ptr++)
- replicas_entry_add_dev(r, ptr->dev);
-}
-
-void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
- struct bkey_s_c k)
-{
- e->nr_devs = 0;
-
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- e->data_type = BCH_DATA_btree;
- extent_to_replicas(k, e);
- break;
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- e->data_type = BCH_DATA_user;
- extent_to_replicas(k, e);
- break;
- case KEY_TYPE_stripe:
- e->data_type = BCH_DATA_parity;
- stripe_to_replicas(k, e);
- break;
- }
-
- bch2_replicas_entry_sort(e);
-}
-
-void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
- enum bch_data_type data_type,
- struct bch_devs_list devs)
-{
- BUG_ON(!data_type ||
- data_type == BCH_DATA_sb ||
- data_type >= BCH_DATA_NR);
-
- e->data_type = data_type;
- e->nr_devs = 0;
- e->nr_required = 1;
-
- darray_for_each(devs, i)
- replicas_entry_add_dev(e, *i);
-
- bch2_replicas_entry_sort(e);
-}
-
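-/*
- * Return a copy of @old with @new_entry appended, re-sorted for eytzinger
- * lookup; .entries is NULL if the allocation failed.
- */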
-static struct bch_replicas_cpu
-cpu_replicas_add_entry(struct bch_fs *c,
- struct bch_replicas_cpu *old,
- struct bch_replicas_entry_v1 *new_entry)
-{
- struct bch_replicas_cpu new = {
- .nr = old->nr + 1,
- .entry_size = max_t(unsigned, old->entry_size,
- replicas_entry_bytes(new_entry)),
- };
-
- new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
- if (!new.entries)
- return new;
-
- for (unsigned i = 0; i < old->nr; i++)
- memcpy(cpu_replicas_entry(&new, i),
- cpu_replicas_entry(old, i),
- old->entry_size);
-
- memcpy(cpu_replicas_entry(&new, old->nr),
- new_entry,
- replicas_entry_bytes(new_entry));
-
- bch2_cpu_replicas_sort(&new);
- return new;
-}
-
-static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
- struct bch_replicas_entry_v1 *search)
-{
- int idx, entry_size = replicas_entry_bytes(search);
-
- if (unlikely(entry_size > r->entry_size))
- return -1;
-
-#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
- idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
- entry_cmp, search);
-#undef entry_cmp
-
- return idx < r->nr ? idx : -1;
-}
-
-int bch2_replicas_entry_idx(struct bch_fs *c,
- struct bch_replicas_entry_v1 *search)
-{
- bch2_replicas_entry_sort(search);
-
- return __replicas_entry_idx(&c->replicas, search);
-}
-
-static bool __replicas_has_entry(struct bch_replicas_cpu *r,
- struct bch_replicas_entry_v1 *search)
-{
- return __replicas_entry_idx(r, search) >= 0;
-}
-
-bool bch2_replicas_marked_locked(struct bch_fs *c,
- struct bch_replicas_entry_v1 *search)
-{
- verify_replicas_entry(search);
-
- return !search->nr_devs ||
- (__replicas_has_entry(&c->replicas, search) &&
- (likely((!c->replicas_gc.entries)) ||
- __replicas_has_entry(&c->replicas_gc, search)));
-}
-
-bool bch2_replicas_marked(struct bch_fs *c,
- struct bch_replicas_entry_v1 *search)
-{
- percpu_down_read(&c->mark_lock);
- bool ret = bch2_replicas_marked_locked(c, search);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
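-/*
- * Slow path for marking a replicas entry not yet in the tables: add it to the
- * superblock replicas section and persist it before swapping in the updated
- * in-memory (and, if active, gc) tables.
- */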
-noinline
-static int bch2_mark_replicas_slowpath(struct bch_fs *c,
- struct bch_replicas_entry_v1 *new_entry)
-{
- struct bch_replicas_cpu new_r, new_gc;
- int ret = 0;
-
- verify_replicas_entry(new_entry);
-
- memset(&new_r, 0, sizeof(new_r));
- memset(&new_gc, 0, sizeof(new_gc));
-
- mutex_lock(&c->sb_lock);
-
- if (c->replicas_gc.entries &&
- !__replicas_has_entry(&c->replicas_gc, new_entry)) {
- new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
- if (!new_gc.entries) {
- ret = bch_err_throw(c, ENOMEM_cpu_replicas);
- goto err;
- }
- }
-
- if (!__replicas_has_entry(&c->replicas, new_entry)) {
- new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
- if (!new_r.entries) {
- ret = bch_err_throw(c, ENOMEM_cpu_replicas);
- goto err;
- }
-
- ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
- if (ret)
- goto err;
- }
-
- if (!new_r.entries &&
- !new_gc.entries)
- goto out;
-
- /* allocations done, now commit: */
-
- if (new_r.entries)
- bch2_write_super(c);
-
- /* don't update in memory replicas until changes are persistent */
- percpu_down_write(&c->mark_lock);
- if (new_r.entries)
- swap(c->replicas, new_r);
- if (new_gc.entries)
- swap(new_gc, c->replicas_gc);
- percpu_up_write(&c->mark_lock);
-out:
- mutex_unlock(&c->sb_lock);
-
- kfree(new_r.entries);
- kfree(new_gc.entries);
-
- return ret;
-err:
- bch_err_msg(c, ret, "adding replicas entry");
- goto out;
-}
-
-int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
-{
- return likely(bch2_replicas_marked(c, r))
- ? 0 : bch2_mark_replicas_slowpath(c, r);
-}
-
-/*
- * Old replicas_gc mechanism: only used for journal replicas entries now, should
- * die at some point:
- */
-
-int bch2_replicas_gc_end(struct bch_fs *c, int ret)
-{
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
- percpu_down_write(&c->mark_lock);
-
- ret = ret ?:
- bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
- if (!ret)
- swap(c->replicas, c->replicas_gc);
-
- kfree(c->replicas_gc.entries);
- c->replicas_gc.entries = NULL;
-
- percpu_up_write(&c->mark_lock);
-
- if (!ret)
- bch2_write_super(c);
-
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
- struct bch_replicas_entry_v1 *e;
- unsigned i = 0;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
- BUG_ON(c->replicas_gc.entries);
-
- c->replicas_gc.nr = 0;
- c->replicas_gc.entry_size = 0;
-
- for_each_cpu_replicas_entry(&c->replicas, e) {
- /* Preserve unknown data types */
- if (e->data_type >= BCH_DATA_NR ||
- !((1 << e->data_type) & typemask)) {
- c->replicas_gc.nr++;
- c->replicas_gc.entry_size =
- max_t(unsigned, c->replicas_gc.entry_size,
- replicas_entry_bytes(e));
- }
- }
-
- c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
- c->replicas_gc.entry_size,
- GFP_KERNEL);
- if (!c->replicas_gc.entries) {
- mutex_unlock(&c->sb_lock);
- bch_err(c, "error allocating c->replicas_gc");
- return bch_err_throw(c, ENOMEM_replicas_gc);
- }
-
- for_each_cpu_replicas_entry(&c->replicas, e)
- if (e->data_type >= BCH_DATA_NR ||
- !((1 << e->data_type) & typemask))
- memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
- e, c->replicas_gc.entry_size);
-
- bch2_cpu_replicas_sort(&c->replicas_gc);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-/*
- * New much simpler mechanism for clearing out unneeded replicas entries - drop
- * replicas entries that have 0 sectors used.
- *
- * However, we don't track sector counts for journal usage, so this doesn't drop
- * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
- * is retained for that.
- */
-int bch2_replicas_gc2(struct bch_fs *c)
-{
- struct bch_replicas_cpu new = { 0 };
- unsigned nr;
- int ret = 0;
-
- bch2_accounting_mem_gc(c);
-retry:
- nr = READ_ONCE(c->replicas.nr);
- new.entry_size = READ_ONCE(c->replicas.entry_size);
- new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
- if (!new.entries) {
- bch_err(c, "error allocating c->replicas_gc");
- return bch_err_throw(c, ENOMEM_replicas_gc);
- }
-
- mutex_lock(&c->sb_lock);
- percpu_down_write(&c->mark_lock);
-
- if (nr != c->replicas.nr ||
- new.entry_size != c->replicas.entry_size) {
- percpu_up_write(&c->mark_lock);
- mutex_unlock(&c->sb_lock);
- kfree(new.entries);
- goto retry;
- }
-
- for (unsigned i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
-
- struct disk_accounting_pos k = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
-
- unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
- "embedded variable length struct");
-
- struct bpos p = disk_accounting_pos_to_bpos(&k);
-
- struct bch_accounting_mem *acc = &c->accounting;
- bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &p) >= acc->k.nr;
-
- if (e->data_type == BCH_DATA_journal || !kill)
- memcpy(cpu_replicas_entry(&new, new.nr++),
- e, new.entry_size);
- }
-
- bch2_cpu_replicas_sort(&new);
-
- ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
-
- if (!ret)
- swap(c->replicas, new);
-
- kfree(new.entries);
-
- percpu_up_write(&c->mark_lock);
-
- if (!ret)
- bch2_write_super(c);
-
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-/* Replicas tracking - superblock: */
-
-static int
-__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
- struct bch_replicas_cpu *cpu_r)
-{
- struct bch_replicas_entry_v1 *e, *dst;
- unsigned nr = 0, entry_size = 0, idx = 0;
-
- for_each_replicas_entry(sb_r, e) {
- entry_size = max_t(unsigned, entry_size,
- replicas_entry_bytes(e));
- nr++;
- }
-
- cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
- if (!cpu_r->entries)
- return -BCH_ERR_ENOMEM_cpu_replicas;
-
- cpu_r->nr = nr;
- cpu_r->entry_size = entry_size;
-
- for_each_replicas_entry(sb_r, e) {
- dst = cpu_replicas_entry(cpu_r, idx++);
- memcpy(dst, e, replicas_entry_bytes(e));
- bch2_replicas_entry_sort(dst);
- }
-
- return 0;
-}
-
-static int
-__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
- struct bch_replicas_cpu *cpu_r)
-{
- struct bch_replicas_entry_v0 *e;
- unsigned nr = 0, entry_size = 0, idx = 0;
-
- for_each_replicas_entry(sb_r, e) {
- entry_size = max_t(unsigned, entry_size,
- replicas_entry_bytes(e));
- nr++;
- }
-
- entry_size += sizeof(struct bch_replicas_entry_v1) -
- sizeof(struct bch_replicas_entry_v0);
-
- cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
- if (!cpu_r->entries)
- return -BCH_ERR_ENOMEM_cpu_replicas;
-
- cpu_r->nr = nr;
- cpu_r->entry_size = entry_size;
-
- for_each_replicas_entry(sb_r, e) {
- struct bch_replicas_entry_v1 *dst =
- cpu_replicas_entry(cpu_r, idx++);
-
- dst->data_type = e->data_type;
- dst->nr_devs = e->nr_devs;
- dst->nr_required = 1;
- memcpy(dst->devs, e->devs, e->nr_devs);
- bch2_replicas_entry_sort(dst);
- }
-
- return 0;
-}
-
-int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
-{
- struct bch_sb_field_replicas *sb_v1;
- struct bch_sb_field_replicas_v0 *sb_v0;
- struct bch_replicas_cpu new_r = { 0, 0, NULL };
- int ret = 0;
-
- if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
- ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
- else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
- ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
- if (ret)
- return ret;
-
- bch2_cpu_replicas_sort(&new_r);
-
- percpu_down_write(&c->mark_lock);
- swap(c->replicas, new_r);
- percpu_up_write(&c->mark_lock);
-
- kfree(new_r.entries);
-
- return 0;
-}
-
-static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
- struct bch_replicas_cpu *r)
-{
- struct bch_sb_field_replicas_v0 *sb_r;
- struct bch_replicas_entry_v0 *dst;
- struct bch_replicas_entry_v1 *src;
- size_t bytes;
-
- bytes = sizeof(struct bch_sb_field_replicas);
-
- for_each_cpu_replicas_entry(r, src)
- bytes += replicas_entry_bytes(src) - 1;
-
- sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
- DIV_ROUND_UP(bytes, sizeof(u64)));
- if (!sb_r)
- return bch_err_throw(c, ENOSPC_sb_replicas);
-
- bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
- sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
-
- memset(&sb_r->entries, 0,
- vstruct_end(&sb_r->field) -
- (void *) &sb_r->entries);
-
- dst = sb_r->entries;
- for_each_cpu_replicas_entry(r, src) {
- dst->data_type = src->data_type;
- dst->nr_devs = src->nr_devs;
- memcpy(dst->devs, src->devs, src->nr_devs);
-
- dst = replicas_entry_next(dst);
-
- BUG_ON((void *) dst > vstruct_end(&sb_r->field));
- }
-
- return 0;
-}
-
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
- struct bch_replicas_cpu *r)
-{
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry_v1 *dst, *src;
- bool need_v1 = false;
- size_t bytes;
-
- bytes = sizeof(struct bch_sb_field_replicas);
-
- for_each_cpu_replicas_entry(r, src) {
- bytes += replicas_entry_bytes(src);
- if (src->nr_required != 1)
- need_v1 = true;
- }
-
- if (!need_v1)
- return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
-
- sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
- DIV_ROUND_UP(bytes, sizeof(u64)));
- if (!sb_r)
- return bch_err_throw(c, ENOSPC_sb_replicas);
-
- bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
- sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
-
- memset(&sb_r->entries, 0,
- vstruct_end(&sb_r->field) -
- (void *) &sb_r->entries);
-
- dst = sb_r->entries;
- for_each_cpu_replicas_entry(r, src) {
- memcpy(dst, src, replicas_entry_bytes(src));
-
- dst = replicas_entry_next(dst);
-
- BUG_ON((void *) dst > vstruct_end(&sb_r->field));
- }
-
- return 0;
-}
-
-static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
- struct bch_sb *sb,
- struct printbuf *err)
-{
- unsigned i;
-
- sort_r(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- bch2_memcmp, NULL,
- (void *)(size_t)cpu_r->entry_size);
-
- for (i = 0; i < cpu_r->nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(cpu_r, i);
-
- int ret = bch2_replicas_entry_sb_validate(e, sb, err);
- if (ret)
- return ret;
-
- if (i + 1 < cpu_r->nr) {
- struct bch_replicas_entry_v1 *n =
- cpu_replicas_entry(cpu_r, i + 1);
-
- BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
-
- if (!memcmp(e, n, cpu_r->entry_size)) {
- prt_printf(err, "duplicate replicas entry ");
- bch2_replicas_entry_to_text(err, e);
- return -BCH_ERR_invalid_sb_replicas;
- }
- }
- }
-
- return 0;
-}
-
-static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
- struct bch_replicas_cpu cpu_r;
- int ret;
-
- ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
- if (ret)
- return ret;
-
- ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
- kfree(cpu_r.entries);
- return ret;
-}
-
-static void bch2_sb_replicas_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_replicas *r = field_to_type(f, replicas);
- struct bch_replicas_entry_v1 *e;
- bool first = true;
-
- for_each_replicas_entry(r, e) {
- if (!first)
- prt_printf(out, " ");
- first = false;
-
- bch2_replicas_entry_to_text(out, e);
- }
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
- .validate = bch2_sb_replicas_validate,
- .to_text = bch2_sb_replicas_to_text,
-};
-
-static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
- struct bch_replicas_cpu cpu_r;
- int ret;
-
- ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
- if (ret)
- return ret;
-
- ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
- kfree(cpu_r.entries);
- return ret;
-}
-
-static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
- struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
- struct bch_replicas_entry_v0 *e;
- bool first = true;
-
- for_each_replicas_entry(sb_r, e) {
- if (!first)
- prt_printf(out, " ");
- first = false;
-
- bch2_replicas_entry_v0_to_text(out, e);
- }
- prt_newline(out);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
- .validate = bch2_sb_replicas_v0_validate,
- .to_text = bch2_sb_replicas_v0_to_text,
-};
-
-/* Query replicas: */
-
-bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
- unsigned flags, bool print)
-{
- struct bch_replicas_entry_v1 *e;
- bool ret = true;
-
- percpu_down_read(&c->mark_lock);
- for_each_cpu_replicas_entry(&c->replicas, e) {
- unsigned nr_online = 0, nr_failed = 0, dflags = 0;
- bool metadata = e->data_type < BCH_DATA_user;
-
- if (e->data_type == BCH_DATA_cached)
- continue;
-
- scoped_guard(rcu)
- for (unsigned i = 0; i < e->nr_devs; i++) {
- if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
- nr_failed++;
- continue;
- }
-
- nr_online += test_bit(e->devs[i], devs.d);
-
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
- nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
- }
-
- if (nr_online + nr_failed == e->nr_devs)
- continue;
-
- if (nr_online < e->nr_required)
- dflags |= metadata
- ? BCH_FORCE_IF_METADATA_LOST
- : BCH_FORCE_IF_DATA_LOST;
-
- if (nr_online < e->nr_devs)
- dflags |= metadata
- ? BCH_FORCE_IF_METADATA_DEGRADED
- : BCH_FORCE_IF_DATA_DEGRADED;
-
- if (dflags & ~flags) {
- if (print) {
- struct printbuf buf = PRINTBUF;
-
- bch2_replicas_entry_to_text(&buf, e);
- bch_err(c, "insufficient devices online (%u) for replicas entry %s",
- nr_online, buf.buf);
- printbuf_exit(&buf);
- }
- ret = false;
- break;
- }
- }
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
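-/*
- * Return a bitmask of BCH_DATA_* types that the superblock replicas entries
- * record as present on device @dev.
- */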
-unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
-{
- struct bch_sb_field_replicas *replicas;
- struct bch_sb_field_replicas_v0 *replicas_v0;
- unsigned data_has = 0;
-
- replicas = bch2_sb_field_get(sb, replicas);
- replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
-
- if (replicas) {
- struct bch_replicas_entry_v1 *r;
-
- for_each_replicas_entry(replicas, r) {
- if (r->data_type >= sizeof(data_has) * 8)
- continue;
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] == dev)
- data_has |= 1 << r->data_type;
- }
-
- } else if (replicas_v0) {
- struct bch_replicas_entry_v0 *r;
-
- for_each_replicas_entry_v0(replicas_v0, r) {
- if (r->data_type >= sizeof(data_has) * 8)
- continue;
-
- for (unsigned i = 0; i < r->nr_devs; i++)
- if (r->devs[i] == dev)
- data_has |= 1 << r->data_type;
- }
- }
-
- return data_has;
-}
-
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
-{
- mutex_lock(&c->sb_lock);
- unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-void bch2_fs_replicas_exit(struct bch_fs *c)
-{
- kfree(c->replicas.entries);
- kfree(c->replicas_gc.entries);
-}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
deleted file mode 100644
index 5aba2c1ce133..000000000000
--- a/fs/bcachefs/replicas.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_H
-#define _BCACHEFS_REPLICAS_H
-
-#include "bkey.h"
-#include "eytzinger.h"
-#include "replicas_types.h"
-
-void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
-void bch2_replicas_entry_to_text(struct printbuf *,
- struct bch_replicas_entry_v1 *);
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
- struct bch_fs *, struct printbuf *);
-void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
-
-static inline struct bch_replicas_entry_v1 *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
- return (void *) r->entries + r->entry_size * i;
-}
-
-int bch2_replicas_entry_idx(struct bch_fs *,
- struct bch_replicas_entry_v1 *);
-
-void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
- enum bch_data_type,
- struct bch_devs_list);
-
-bool bch2_replicas_marked_locked(struct bch_fs *,
- struct bch_replicas_entry_v1 *);
-bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
-int bch2_mark_replicas(struct bch_fs *,
- struct bch_replicas_entry_v1 *);
-
-void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
-
-static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
- unsigned dev)
-{
- e->data_type = BCH_DATA_cached;
- e->nr_devs = 1;
- e->nr_required = 1;
- e->devs[0] = dev;
-}
-
-bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
- unsigned, bool);
-
-unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
-unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
-
-int bch2_replicas_gc_end(struct bch_fs *, int);
-int bch2_replicas_gc_start(struct bch_fs *, unsigned);
-int bch2_replicas_gc2(struct bch_fs *);
-
-#define for_each_cpu_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
- _i = (void *) (_i) + (_r)->entry_size)
-
-/* iterate over superblock replicas - used by userspace tools: */
-
-#define replicas_entry_next(_i) \
- ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))
-
-#define for_each_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
- (_i) = replicas_entry_next(_i))
-
-#define for_each_replicas_entry_v0(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
- (_i) = replicas_entry_next(_i))
-
-int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
-extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
-
-void bch2_fs_replicas_exit(struct bch_fs *);
-
-#endif /* _BCACHEFS_REPLICAS_H */
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
deleted file mode 100644
index b7eff904acdb..000000000000
--- a/fs/bcachefs/replicas_format.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_FORMAT_H
-#define _BCACHEFS_REPLICAS_FORMAT_H
-
-struct bch_replicas_entry_v0 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 devs[] __counted_by(nr_devs);
-} __packed;
-
-struct bch_sb_field_replicas_v0 {
- struct bch_sb_field field;
- struct bch_replicas_entry_v0 entries[];
-} __packed __aligned(8);
-
-struct bch_replicas_entry_v1 {
- __u8 data_type;
- __u8 nr_devs;
- __u8 nr_required;
- __u8 devs[] __counted_by(nr_devs);
-} __packed;
-
-struct bch_sb_field_replicas {
- struct bch_sb_field field;
- struct bch_replicas_entry_v1 entries[];
-} __packed __aligned(8);
-
-#define replicas_entry_bytes(_i) \
- (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
-
-#define replicas_entry_add_dev(e, d) ({ \
- (e)->nr_devs++; \
- (e)->devs[(e)->nr_devs - 1] = (d); \
-})
-
-#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
deleted file mode 100644
index fed71c861fe7..000000000000
--- a/fs/bcachefs/replicas_types.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_REPLICAS_TYPES_H
-#define _BCACHEFS_REPLICAS_TYPES_H
-
-struct bch_replicas_cpu {
- unsigned nr;
- unsigned entry_size;
- struct bch_replicas_entry_v1 *entries;
-};
-
-#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
deleted file mode 100644
index 59c8770e4a0e..000000000000
--- a/fs/bcachefs/sb-clean.c
+++ /dev/null
@@ -1,340 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
-#include "error.h"
-#include "journal_io.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "super-io.h"
-
-/*
- * BCH_SB_FIELD_clean:
- *
- * Btree roots, and a few other things, are recovered from the journal after an
- * unclean shutdown - but after a clean shutdown, to avoid having to read the
- * journal, we can store them in the superblock.
- *
- * bch_sb_field_clean simply contains a list of journal entries, stored exactly
- * as they would be in the journal:
- */
-
-int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
- int write)
-{
- struct bkey_validate_context from = {
- .flags = write,
- .from = BKEY_VALIDATE_superblock,
- };
- struct jset_entry *entry;
- int ret;
-
- for (entry = clean->start;
- entry < (struct jset_entry *) vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- if (vstruct_end(entry) > vstruct_end(&clean->field)) {
- bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu",
- le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s),
- (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field));
- bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun);
- return -BCH_ERR_fsck_repair_unimplemented;
- }
-
- ret = bch2_journal_entry_validate(c, NULL, entry,
- le16_to_cpu(c->disk_sb.sb->version),
- BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
- from);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
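-/*
- * Find the btree root entry for @id in either the superblock clean section or
- * a journal entry; returns the root key and sets *level.
- */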
-static struct bkey_i *btree_root_find(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct jset *j,
- enum btree_id id, unsigned *level)
-{
- struct bkey_i *k;
- struct jset_entry *entry, *start, *end;
-
- if (clean) {
- start = clean->start;
- end = vstruct_end(&clean->field);
- } else {
- start = j->start;
- end = vstruct_last(j);
- }
-
- for (entry = start; entry < end; entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_btree_root &&
- entry->btree_id == id)
- goto found;
-
- return NULL;
-found:
- if (!entry->u64s)
- return ERR_PTR(-EINVAL);
-
- k = entry->start;
- *level = entry->level;
- return k;
-}
-
-int bch2_verify_superblock_clean(struct bch_fs *c,
- struct bch_sb_field_clean **cleanp,
- struct jset *j)
-{
- unsigned i;
- struct bch_sb_field_clean *clean = *cleanp;
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
- int ret = 0;
-
- if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
- sb_clean_journal_seq_mismatch,
- "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
- le64_to_cpu(clean->journal_seq),
- le64_to_cpu(j->seq))) {
- kfree(clean);
- *cleanp = NULL;
- return 0;
- }
-
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct bkey_i *k1, *k2;
- unsigned l1 = 0, l2 = 0;
-
- k1 = btree_root_find(c, clean, NULL, i, &l1);
- k2 = btree_root_find(c, NULL, j, i, &l2);
-
- if (!k1 && !k2)
- continue;
-
- printbuf_reset(&buf1);
- printbuf_reset(&buf2);
-
- if (k1)
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
- else
- prt_printf(&buf1, "(none)");
-
- if (k2)
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
- else
- prt_printf(&buf2, "(none)");
-
- mustfix_fsck_err_on(!k1 || !k2 ||
- IS_ERR(k1) ||
- IS_ERR(k2) ||
- k1->k.u64s != k2->k.u64s ||
- memcmp(k1, k2, bkey_bytes(&k1->k)) ||
- l1 != l2, c,
- sb_clean_btree_root_mismatch,
- "superblock btree root %u doesn't match journal after clean shutdown\n"
- "sb: l=%u %s\n"
- "journal: l=%u %s\n", i,
- l1, buf1.buf,
- l2, buf2.buf);
- }
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
-struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
-{
- struct bch_sb_field_clean *clean, *sb_clean;
- int ret;
-
- mutex_lock(&c->sb_lock);
- sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
-
- if (fsck_err_on(!sb_clean, c,
- sb_clean_missing,
- "superblock marked clean but clean section not present")) {
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->sb.clean = false;
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(-BCH_ERR_invalid_sb_clean);
- }
-
- clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
- GFP_KERNEL);
- if (!clean) {
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
- }
-
- ret = bch2_sb_clean_validate_late(c, clean, READ);
- if (ret) {
- kfree(clean);
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(ret);
- }
-
- mutex_unlock(&c->sb_lock);
-
- return clean;
-fsck_err:
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(ret);
-}
-
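-/*
- * Append the common jset entries - the filesystem key version and the two IO
- * clock values - used when writing the superblock clean section.
- */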
-void bch2_journal_super_entries_add_common(struct bch_fs *c,
- struct jset_entry **end,
- u64 journal_seq)
-{
- {
- struct jset_entry_usage *u =
- container_of(jset_entry_init(end, sizeof(*u)),
- struct jset_entry_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = BCH_FS_USAGE_key_version;
- u->v = cpu_to_le64(atomic64_read(&c->key_version));
- }
-
- for (unsigned i = 0; i < 2; i++) {
- struct jset_entry_clock *clock =
- container_of(jset_entry_init(end, sizeof(*clock)),
- struct jset_entry_clock, entry);
-
- clock->entry.type = BCH_JSET_ENTRY_clock;
- clock->rw = i;
- clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
- }
-}
-
-static int bch2_sb_clean_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_clean *clean = field_to_type(f, clean);
-
- if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&clean->field), sizeof(*clean));
- return -BCH_ERR_invalid_sb_clean;
- }
-
- for (struct jset_entry *entry = clean->start;
- entry != vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- if ((void *) vstruct_next(entry) > vstruct_end(&clean->field)) {
- prt_str(err, "entry type ");
- bch2_prt_jset_entry_type(err, entry->type);
- prt_str(err, " overruns end of section");
- return -BCH_ERR_invalid_sb_clean;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_clean *clean = field_to_type(f, clean);
- struct jset_entry *entry;
-
- prt_printf(out, "flags: %x\n", le32_to_cpu(clean->flags));
- prt_printf(out, "journal_seq: %llu\n", le64_to_cpu(clean->journal_seq));
-
- for (entry = clean->start;
- entry != vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- if ((void *) vstruct_next(entry) > vstruct_end(&clean->field))
- break;
-
- if (entry->type == BCH_JSET_ENTRY_btree_keys &&
- !entry->u64s)
- continue;
-
- bch2_journal_entry_to_text(out, NULL, entry);
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_clean = {
- .validate = bch2_sb_clean_validate,
- .to_text = bch2_sb_clean_to_text,
-};
-
-int bch2_fs_mark_dirty(struct bch_fs *c)
-{
- int ret;
-
- /*
- * Unconditionally write superblock, to verify it hasn't changed before
- * we go rw:
- */
-
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
-
- ret = bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-void bch2_fs_mark_clean(struct bch_fs *c)
-{
- struct bch_sb_field_clean *sb_clean;
- struct jset_entry *entry;
- unsigned u64s;
- int ret;
-
- mutex_lock(&c->sb_lock);
- if (BCH_SB_CLEAN(c->disk_sb.sb))
- goto out;
-
- SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
-
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
-
- u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
-
- sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s);
- if (!sb_clean) {
- bch_err(c, "error resizing superblock while setting filesystem clean");
- goto out;
- }
-
- sb_clean->flags = 0;
- sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
-
- /* Trying to catch outstanding bug: */
- BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
-
- entry = sb_clean->start;
- bch2_journal_super_entries_add_common(c, &entry, 0);
- entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
- BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
-
- memset(entry, 0,
- vstruct_end(&sb_clean->field) - (void *) entry);
-
- /*
- * this should be in the write path, and we should be validating every
- * superblock section:
- */
- ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
- if (ret) {
-		bch_err(c, "error writing superblock while marking filesystem clean: validate error");
- goto out;
- }
-
- bch2_journal_pos_from_member_info_set(c);
-
- bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-}
diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h
deleted file mode 100644
index 71caef281239..000000000000
--- a/fs/bcachefs/sb-clean.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_CLEAN_H
-#define _BCACHEFS_SB_CLEAN_H
-
-int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
-int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
- struct jset *);
-struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
-void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
-
-int bch2_fs_mark_dirty(struct bch_fs *);
-void bch2_fs_mark_clean(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
deleted file mode 100644
index 2b4b8445d418..000000000000
--- a/fs/bcachefs/sb-counters.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "super-io.h"
-#include "sb-counters.h"
-
-/* BCH_SB_FIELD_counters */
-
-static const u8 counters_to_stable_map[] = {
-#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n,
- BCH_PERSISTENT_COUNTERS()
-#undef x
-};
-
-const char * const bch2_counter_names[] = {
-#define x(t, n, ...) (#t),
- BCH_PERSISTENT_COUNTERS()
-#undef x
- NULL
-};
-
-static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
-{
- if (!ctrs)
- return 0;
-
- return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
-}
-
-static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- return 0;
-}
-
-static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
- unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
- if (stable < nr)
- prt_printf(out, "%s \t%llu\n",
- bch2_counter_names[i],
- le64_to_cpu(ctrs->d[stable]));
- }
-}
-
-int bch2_sb_counters_to_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
- unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++)
- c->counters_on_mount[i] = 0;
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
- if (stable < nr) {
- u64 v = le64_to_cpu(ctrs->d[stable]);
- percpu_u64_set(&c->counters[i], v);
- c->counters_on_mount[i] = v;
- }
- }
-
- return 0;
-}
-
-int bch2_sb_counters_from_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
- struct bch_sb_field_counters *ret;
- unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
-
- if (nr < BCH_COUNTER_NR) {
- ret = bch2_sb_field_resize(&c->disk_sb, counters,
- sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
- if (ret) {
- ctrs = ret;
- nr = bch2_sb_counter_nr_entries(ctrs);
- }
- }
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
- if (stable < nr)
- ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
- }
-
- return 0;
-}
-
-void bch2_fs_counters_exit(struct bch_fs *c)
-{
- free_percpu(c->counters);
-}
-
-int bch2_fs_counters_init(struct bch_fs *c)
-{
- c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
- if (!c->counters)
- return -BCH_ERR_ENOMEM_fs_counters_init;
-
- return bch2_sb_counters_to_cpu(c);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_counters = {
- .validate = bch2_sb_counters_validate,
- .to_text = bch2_sb_counters_to_text,
-};
-
-#ifndef NO_BCACHEFS_CHARDEV
-long bch2_ioctl_query_counters(struct bch_fs *c,
- struct bch_ioctl_query_counters __user *user_arg)
-{
- struct bch_ioctl_query_counters arg;
- int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg));
- if (ret)
- return ret;
-
- if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) ||
- arg.pad)
- return -EINVAL;
-
- arg.nr = min(arg.nr, BCH_COUNTER_NR);
- ret = put_user(arg.nr, &user_arg->nr);
- if (ret)
- return ret;
-
- for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
- unsigned stable = counters_to_stable_map[i];
-
- if (stable < arg.nr) {
- u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT)
- ? percpu_u64_get(&c->counters[i])
- : c->counters_on_mount[i];
-
- ret = put_user(v, &user_arg->d[stable]);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-#endif
diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h
deleted file mode 100644
index a4329ad8dd1b..000000000000
--- a/fs/bcachefs/sb-counters.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_COUNTERS_H
-#define _BCACHEFS_SB_COUNTERS_H
-
-#include "bcachefs.h"
-#include "super-io.h"
-
-int bch2_sb_counters_to_cpu(struct bch_fs *);
-int bch2_sb_counters_from_cpu(struct bch_fs *);
-
-void bch2_fs_counters_exit(struct bch_fs *);
-int bch2_fs_counters_init(struct bch_fs *);
-
-extern const char * const bch2_counter_names[];
-extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
-
-long bch2_ioctl_query_counters(struct bch_fs *,
- struct bch_ioctl_query_counters __user *);
-
-#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
deleted file mode 100644
index b868702a431a..000000000000
--- a/fs/bcachefs/sb-counters_format.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
-#define _BCACHEFS_SB_COUNTERS_FORMAT_H
-
-enum counters_flags {
- TYPE_COUNTER = BIT(0), /* event counters */
- TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */
-};
-
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0, TYPE_SECTORS) \
- x(io_read_inline, 80, TYPE_SECTORS) \
- x(io_read_hole, 81, TYPE_SECTORS) \
- x(io_read_promote, 30, TYPE_COUNTER) \
- x(io_read_bounce, 31, TYPE_COUNTER) \
- x(io_read_split, 33, TYPE_COUNTER) \
- x(io_read_reuse_race, 34, TYPE_COUNTER) \
- x(io_read_retry, 32, TYPE_COUNTER) \
- x(io_read_fail_and_poison, 82, TYPE_COUNTER) \
- x(io_write, 1, TYPE_SECTORS) \
- x(io_move, 2, TYPE_SECTORS) \
- x(io_move_read, 35, TYPE_SECTORS) \
- x(io_move_write, 36, TYPE_SECTORS) \
- x(io_move_finish, 37, TYPE_SECTORS) \
- x(io_move_fail, 38, TYPE_COUNTER) \
- x(io_move_write_fail, 82, TYPE_COUNTER) \
- x(io_move_start_fail, 39, TYPE_COUNTER) \
- x(io_move_created_rebalance, 83, TYPE_COUNTER) \
- x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \
- x(bucket_invalidate, 3, TYPE_COUNTER) \
- x(bucket_discard, 4, TYPE_COUNTER) \
- x(bucket_discard_fast, 79, TYPE_COUNTER) \
- x(bucket_alloc, 5, TYPE_COUNTER) \
- x(bucket_alloc_fail, 6, TYPE_COUNTER) \
- x(btree_cache_scan, 7, TYPE_COUNTER) \
- x(btree_cache_reap, 8, TYPE_COUNTER) \
- x(btree_cache_cannibalize, 9, TYPE_COUNTER) \
- x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \
- x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \
- x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \
- x(btree_node_write, 13, TYPE_COUNTER) \
- x(btree_node_read, 14, TYPE_COUNTER) \
- x(btree_node_compact, 15, TYPE_COUNTER) \
- x(btree_node_merge, 16, TYPE_COUNTER) \
- x(btree_node_split, 17, TYPE_COUNTER) \
- x(btree_node_rewrite, 18, TYPE_COUNTER) \
- x(btree_node_alloc, 19, TYPE_COUNTER) \
- x(btree_node_free, 20, TYPE_COUNTER) \
- x(btree_node_set_root, 21, TYPE_COUNTER) \
- x(btree_path_relock_fail, 22, TYPE_COUNTER) \
- x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \
- x(btree_reserve_get_fail, 24, TYPE_COUNTER) \
- x(journal_entry_full, 25, TYPE_COUNTER) \
- x(journal_full, 26, TYPE_COUNTER) \
- x(journal_reclaim_finish, 27, TYPE_COUNTER) \
- x(journal_reclaim_start, 28, TYPE_COUNTER) \
- x(journal_write, 29, TYPE_COUNTER) \
- x(copygc, 40, TYPE_COUNTER) \
- x(copygc_wait, 41, TYPE_COUNTER) \
- x(gc_gens_end, 42, TYPE_COUNTER) \
- x(gc_gens_start, 43, TYPE_COUNTER) \
- x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \
- x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \
- x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \
- x(trans_restart_fault_inject, 47, TYPE_COUNTER) \
- x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \
- x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \
- x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \
- x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \
- x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \
- x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \
- x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \
- x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \
- x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \
- x(trans_restart_relock, 57, TYPE_COUNTER) \
- x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \
- x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \
- x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \
- x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \
- x(trans_restart_relock_path, 62, TYPE_COUNTER) \
- x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \
- x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \
- x(trans_restart_traverse, 65, TYPE_COUNTER) \
- x(trans_restart_upgrade, 66, TYPE_COUNTER) \
- x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \
- x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \
- x(trans_restart_injected, 69, TYPE_COUNTER) \
- x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \
- x(trans_traverse_all, 71, TYPE_COUNTER) \
- x(transaction_commit, 72, TYPE_COUNTER) \
- x(write_super, 73, TYPE_COUNTER) \
- x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \
- x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \
- x(trans_restart_split_race, 76, TYPE_COUNTER) \
- x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \
- x(write_buffer_flush_sync, 78, TYPE_COUNTER)
-
-enum bch_persistent_counters {
-#define x(t, n, ...) BCH_COUNTER_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_NR
-};
-
-enum bch_persistent_counters_stable {
-#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_STABLE_NR
-};
-
-struct bch_sb_field_counters {
- struct bch_sb_field field;
- __le64 d[];
-};
-
-#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
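
Aside for readers of the deleted format header above: BCH_PERSISTENT_COUNTERS() is an x-macro table that gets expanded twice, once to produce a dense in-memory enum (used as array indices) and once to produce the stable on-disk slot numbers that must never be reused. The standalone sketch below shows the same pattern in isolation; the COUNTERS() list and names in it are invented for illustration, not taken from bcachefs.

#include <stdio.h>

#define COUNTERS()				\
	x(io_read,		0)		\
	x(io_write,		1)		\
	x(bucket_alloc,		5)

enum counter {				/* dense: in-memory array indices */
#define x(t, n)	COUNTER_##t,
	COUNTERS()
#undef x
	COUNTER_NR
};

enum counter_stable {			/* stable: on-disk slot numbers, never reused */
#define x(t, n)	COUNTER_STABLE_##t = n,
	COUNTERS()
#undef x
};

int main(void)
{
	printf("bucket_alloc: dense index %d, stable index %d\n",
	       COUNTER_bucket_alloc, COUNTER_STABLE_bucket_alloc);
	return 0;
}

Expanding one table twice is what keeps the two numberings from drifting apart: adding a counter means adding exactly one x() line.
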
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
deleted file mode 100644
index 1506d05e0665..000000000000
--- a/fs/bcachefs/sb-downgrade.c
+++ /dev/null
@@ -1,457 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Superblock section that contains a list of recovery passes to run when
- * downgrading past a given version
- */
-
-#include "bcachefs.h"
-#include "darray.h"
-#include "recovery_passes.h"
-#include "sb-downgrade.h"
-#include "sb-errors.h"
-#include "super-io.h"
-
-#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
-
-/*
- * Upgrade, downgrade tables - run certain recovery passes, fix certain errors
- *
- * x(version, recovery_passes, errors...)
- */
-#define UPGRADE_TABLE() \
- x(snapshot_2, \
- RECOVERY_PASS_ALL_FSCK, \
- BCH_FSCK_ERR_subvol_root_wrong_bi_subvol, \
- BCH_FSCK_ERR_subvol_not_master_and_not_snapshot) \
- x(backpointers, \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_v3, \
- RECOVERY_PASS_ALL_FSCK) \
- x(unwritten_extents, \
- RECOVERY_PASS_ALL_FSCK) \
- x(bucket_gens, \
- BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
- RECOVERY_PASS_ALL_FSCK) \
- x(lru_v2, \
- RECOVERY_PASS_ALL_FSCK) \
- x(fragmentation_lru, \
- RECOVERY_PASS_ALL_FSCK) \
- x(no_bps_in_alloc_keys, \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_trees, \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_skiplists, \
- BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \
- BCH_FSCK_ERR_snapshot_bad_depth, \
- BCH_FSCK_ERR_snapshot_bad_skiplist) \
- x(deleted_inodes, \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
- BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
- x(rebalance_work, \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
- x(subvolume_fs_parent, \
- BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
- BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
- x(btree_subvolume_children, \
- BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
- BCH_FSCK_ERR_subvol_children_not_set) \
- x(mi_btree_bitmap, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_btree_bitmap_not_marked) \
- x(disk_accounting_v2, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_bkey_version_in_future, \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_accounting_mismatch) \
- x(disk_accounting_v3, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_bkey_version_in_future, \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \
- BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \
- BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(disk_accounting_inum, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch) \
- x(rebalance_work_acct_fix, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch) \
- x(inode_has_child_snapshots, \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
- BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \
- x(backpointer_bucket_gen, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_backpointer_to_missing_ptr, \
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(disk_accounting_big_endian, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(cached_backpointers, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(stripe_backpointers, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(inode_has_case_insensitive, \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
- BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \
- BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)
-
-#define DOWNGRADE_TABLE() \
- x(bucket_stripe_sectors, \
- 0) \
- x(disk_accounting_v2, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_fs_usage_hidden_wrong, \
- BCH_FSCK_ERR_fs_usage_btree_wrong, \
- BCH_FSCK_ERR_fs_usage_data_wrong, \
- BCH_FSCK_ERR_fs_usage_cached_wrong, \
- BCH_FSCK_ERR_fs_usage_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
- BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_replicas_wrong, \
- BCH_FSCK_ERR_bkey_version_in_future) \
- x(disk_accounting_v3, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_dev_usage_buckets_wrong, \
- BCH_FSCK_ERR_dev_usage_sectors_wrong, \
- BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
- BCH_FSCK_ERR_fs_usage_hidden_wrong, \
- BCH_FSCK_ERR_fs_usage_btree_wrong, \
- BCH_FSCK_ERR_fs_usage_data_wrong, \
- BCH_FSCK_ERR_fs_usage_cached_wrong, \
- BCH_FSCK_ERR_fs_usage_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
- BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
- BCH_FSCK_ERR_fs_usage_replicas_wrong, \
- BCH_FSCK_ERR_accounting_replicas_not_marked, \
- BCH_FSCK_ERR_bkey_version_in_future) \
- x(rebalance_work_acct_fix, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(backpointer_bucket_gen, \
- BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \
- BCH_FSCK_ERR_backpointer_to_missing_ptr, \
- BCH_FSCK_ERR_ptr_to_missing_backpointer) \
- x(disk_accounting_big_endian, \
- BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch, \
- BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end)
-
-struct upgrade_downgrade_entry {
- u64 recovery_passes;
- u16 version;
- u16 nr_errors;
- const u16 *errors;
-};
-
-#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ };
-UPGRADE_TABLE()
-#undef x
-
-static const struct upgrade_downgrade_entry upgrade_table[] = {
-#define x(ver, passes, ...) { \
- .recovery_passes = passes, \
- .version = bcachefs_metadata_version_##ver,\
- .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
- .errors = upgrade_##ver##_errors, \
-},
-UPGRADE_TABLE()
-#undef x
-};
-
-static int have_stripes(struct bch_fs *c)
-{
- if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
- return 0;
-
- return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
-}
-
-int bch2_sb_set_upgrade_extra(struct bch_fs *c)
-{
- unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
- unsigned new_version = c->sb.version;
- bool write_sb = false;
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- if (old_version < bcachefs_metadata_version_bucket_stripe_sectors &&
- new_version >= bcachefs_metadata_version_bucket_stripe_sectors &&
- (ret = have_stripes(c) > 0)) {
- __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent);
- write_sb = true;
- }
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret < 0 ? ret : 0;
-}
-
-void bch2_sb_set_upgrade(struct bch_fs *c,
- unsigned old_version,
- unsigned new_version)
-{
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- for (const struct upgrade_downgrade_entry *i = upgrade_table;
- i < upgrade_table + ARRAY_SIZE(upgrade_table);
- i++)
- if (i->version > old_version && i->version <= new_version) {
- u64 passes = i->recovery_passes;
-
- if (passes & RECOVERY_PASS_ALL_FSCK)
- passes |= bch2_fsck_recovery_passes();
- passes &= ~RECOVERY_PASS_ALL_FSCK;
-
- ext->recovery_passes_required[0] |=
- cpu_to_le64(bch2_recovery_passes_to_stable(passes));
-
- for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++)
- __set_bit_le64(*e, ext->errors_silent);
- }
-}
-
-#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
-DOWNGRADE_TABLE()
-#undef x
-
-static const struct upgrade_downgrade_entry downgrade_table[] = {
-#define x(ver, passes, ...) { \
- .recovery_passes = passes, \
- .version = bcachefs_metadata_version_##ver,\
- .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \
- .errors = downgrade_##ver##_errors, \
-},
-DOWNGRADE_TABLE()
-#undef x
-};
-
-static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
-{
- unsigned dst_offset = table->nr;
- struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
- unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
- int ret = 0;
-
- unsigned nr_errors = le16_to_cpu(dst->nr_errors);
-
- switch (le16_to_cpu(dst->version)) {
- case bcachefs_metadata_version_bucket_stripe_sectors:
- if (have_stripes(c)) {
- bytes += sizeof(dst->errors[0]) * 2;
-
- ret = darray_make_room(table, bytes);
- if (ret)
- return ret;
-
- dst = (void *) &table->data[dst_offset];
- dst->nr_errors = cpu_to_le16(nr_errors + 1);
-
- /* open coded __set_bit_le64, as dst is packed and
- * dst->recovery_passes is misaligned */
- unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
- dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64));
-
- dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong);
- }
- break;
- }
-
- return ret;
-}
-
-static inline const struct bch_sb_field_downgrade_entry *
-downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
-{
- return (void *) &e->errors[le16_to_cpu(e->nr_errors)];
-}
-
-#define for_each_downgrade_entry(_d, _i) \
- for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \
- (void *) _i < vstruct_end(&(_d)->field) && \
- (void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \
- (void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \
- _i = downgrade_entry_next_c(_i))
-
-static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
-
- for (const struct bch_sb_field_downgrade_entry *i = e->entries;
- (void *) i < vstruct_end(&e->field);
- i = downgrade_entry_next_c(i)) {
- /*
- * Careful: sb_field_downgrade_entry is only 2 byte aligned, but
- * section sizes are 8 byte aligned - an empty entry spanning
- * the end of the section is allowed (and ignored):
- */
- if ((void *) &i->errors[0] > vstruct_end(&e->field))
- break;
-
- if (flags & BCH_VALIDATE_write &&
- (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
- prt_printf(err, "downgrade entry overruns end of superblock section");
- return -BCH_ERR_invalid_sb_downgrade;
- }
-
- if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
- BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
- prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
- BCH_VERSION_MAJOR(le16_to_cpu(i->version)),
- BCH_VERSION_MAJOR(le16_to_cpu(sb->version)));
- return -BCH_ERR_invalid_sb_downgrade;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
-
- if (out->nr_tabstops <= 1)
- printbuf_tabstop_push(out, 16);
-
- for_each_downgrade_entry(e, i) {
- prt_str(out, "version:\t");
- bch2_version_to_text(out, le16_to_cpu(i->version));
- prt_newline(out);
-
- prt_str(out, "recovery passes:\t");
- prt_bitflags(out, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
- prt_newline(out);
-
- prt_str(out, "errors:\t");
- bool first = true;
- for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
- if (!first)
- prt_char(out, ',');
- first = false;
- bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
- }
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
- .validate = bch2_sb_downgrade_validate,
- .to_text = bch2_sb_downgrade_to_text,
-};
-
-int bch2_sb_downgrade_update(struct bch_fs *c)
-{
- if (!test_bit(BCH_FS_btree_running, &c->flags))
- return 0;
-
- darray_char table = {};
- int ret = 0;
-
- for (const struct upgrade_downgrade_entry *src = downgrade_table;
- src < downgrade_table + ARRAY_SIZE(downgrade_table);
- src++) {
- if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
- continue;
-
- if (src->version < c->sb.version_incompat)
- continue;
-
- struct bch_sb_field_downgrade_entry *dst;
- unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
-
- ret = darray_make_room(&table, bytes);
- if (ret)
- goto out;
-
- dst = (void *) &darray_top(table);
- dst->version = cpu_to_le16(src->version);
- dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
- dst->recovery_passes[1] = 0;
- dst->nr_errors = cpu_to_le16(src->nr_errors);
- for (unsigned i = 0; i < src->nr_errors; i++)
- dst->errors[i] = cpu_to_le16(src->errors[i]);
-
- ret = downgrade_table_extra(c, &table);
- if (ret)
- goto out;
-
- if (!dst->recovery_passes[0] &&
- !dst->recovery_passes[1] &&
- !dst->nr_errors)
- continue;
-
- table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
- }
-
- struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
-
- unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64));
-
- if (d && le32_to_cpu(d->field.u64s) > sb_u64s)
- goto out;
-
- d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
- if (!d) {
- ret = bch_err_throw(c, ENOSPC_sb_downgrade);
- goto out;
- }
-
- memcpy(d->entries, table.data, table.nr);
- memset_u64s_tail(d->entries, 0, table.nr);
-out:
- darray_exit(&table);
- return ret;
-}
-
-void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor)
-{
- struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
- if (!d)
- return;
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-
- for_each_downgrade_entry(d, i) {
- unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version));
- if (new_minor < minor && minor <= old_minor) {
- ext->recovery_passes_required[0] |= i->recovery_passes[0];
- ext->recovery_passes_required[1] |= i->recovery_passes[1];
-
- for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
- unsigned e = le16_to_cpu(i->errors[j]);
- if (e < BCH_FSCK_ERR_MAX)
- __set_bit(e, c->sb.errors_silent);
- if (e < sizeof(ext->errors_silent) * 8)
- __set_bit_le64(e, ext->errors_silent);
- }
- }
- }
-}
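
A recurring pattern in the file above is selection by version window: bch2_sb_set_upgrade() takes every table entry with old_version < version <= new_version and ORs its recovery passes into the ext section, and bch2_sb_set_downgrade() does the same over minor versions in the opposite direction. The self-contained model below captures just that selection; the versions and pass bits are invented for the example and are not real bcachefs values.

#include <stdint.h>
#include <stdio.h>

struct entry { unsigned version; uint64_t recovery_passes; };

static const struct entry upgrade_table[] = {
	{ 26, 1ULL << 3 },
	{ 28, 1ULL << 5 },
	{ 31, 1ULL << 7 },
};

static uint64_t passes_for_upgrade(unsigned old_version, unsigned new_version)
{
	uint64_t passes = 0;

	for (unsigned i = 0; i < sizeof(upgrade_table) / sizeof(upgrade_table[0]); i++)
		if (upgrade_table[i].version > old_version &&
		    upgrade_table[i].version <= new_version)
			passes |= upgrade_table[i].recovery_passes;
	return passes;
}

int main(void)
{
	/* entries for versions 28 and 31 fall inside the (27, 31] window */
	printf("passes needed for 27 -> 31: %#llx\n",
	       (unsigned long long) passes_for_upgrade(27, 31));
	return 0;
}
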
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
deleted file mode 100644
index 095b7cc9bb47..000000000000
--- a/fs/bcachefs/sb-downgrade.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_DOWNGRADE_H
-#define _BCACHEFS_SB_DOWNGRADE_H
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
-
-int bch2_sb_downgrade_update(struct bch_fs *);
-void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
-int bch2_sb_set_upgrade_extra(struct bch_fs *);
-void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
-
-#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h
deleted file mode 100644
index cffd932be3ec..000000000000
--- a/fs/bcachefs/sb-downgrade_format.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H
-#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H
-
-struct bch_sb_field_downgrade_entry {
- __le16 version;
- __le64 recovery_passes[2];
- __le16 nr_errors;
- __le16 errors[] __counted_by(nr_errors);
-} __packed __aligned(2);
-
-struct bch_sb_field_downgrade {
- struct bch_sb_field field;
- struct bch_sb_field_downgrade_entry entries[];
-};
-
-#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */
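
The entry layout above is variable length: each bch_sb_field_downgrade_entry is immediately followed by nr_errors 16-bit error codes, and the next entry begins right after them, which is why sb-downgrade.c walks the section with downgrade_entry_next_c() instead of array indexing. Below is a rough userspace model of that walk, with simplified fields (no recovery_passes array, no endian handling) and made-up sample data.

#include <stdint.h>
#include <stdio.h>

struct entry {
	uint16_t	version;
	uint16_t	nr_errors;
	uint16_t	errors[];	/* __counted_by(nr_errors) in the real struct */
} __attribute__((packed, aligned(2)));

static const struct entry *entry_next(const struct entry *e)
{
	return (const void *) &e->errors[e->nr_errors];
}

int main(void)
{
	/* two entries packed back to back: { version, nr_errors, errors... } */
	uint16_t buf[] = { 100, 2, 7, 9,	101, 1, 3 };
	const struct entry *e = (const void *) buf;
	const void *end = (const char *) buf + sizeof(buf);

	/* same bounds checks as for_each_downgrade_entry(): stop before reading
	 * past the end of the section */
	for (;
	     (const void *) e < end && (const void *) entry_next(e) <= end;
	     e = entry_next(e))
		printf("version %u: %u errors\n", e->version, e->nr_errors);
	return 0;
}
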
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
deleted file mode 100644
index 48853efdc105..000000000000
--- a/fs/bcachefs/sb-errors.c
+++ /dev/null
@@ -1,198 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "sb-errors.h"
-#include "super-io.h"
-
-const char * const bch2_sb_error_strs[] = {
-#define x(t, n, ...) [n] = #t,
- BCH_SB_ERRS()
-#undef x
-};
-
-void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
-{
- if (id < BCH_FSCK_ERR_MAX)
- prt_str(out, bch2_sb_error_strs[id]);
- else
- prt_printf(out, "(unknown error %u)", id);
-}
-
-static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
-{
- return bch2_sb_field_nr_entries(e);
-}
-
-static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
-{
- return (sizeof(struct bch_sb_field_errors) +
- sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
-}
-
-static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_errors *e = field_to_type(f, errors);
- unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
-
- for (i = 0; i < nr; i++) {
- if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) {
- prt_printf(err, "entry with count 0 (id ");
- bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
- prt_printf(err, ")");
- return -BCH_ERR_invalid_sb_errors;
- }
-
- if (i + 1 < nr &&
- BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >=
- BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
- prt_printf(err, "entries out of order");
- return -BCH_ERR_invalid_sb_errors;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_errors *e = field_to_type(f, errors);
- unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
-
- if (out->nr_tabstops <= 1)
- printbuf_tabstop_push(out, 16);
-
- for (i = 0; i < nr; i++) {
- bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
- prt_tab(out);
- prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
- prt_tab(out);
- bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
- prt_newline(out);
- }
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_errors = {
- .validate = bch2_sb_errors_validate,
- .to_text = bch2_sb_errors_to_text,
-};
-
-void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c)
-{
- if (out->nr_tabstops < 1)
- printbuf_tabstop_push(out, 48);
- if (out->nr_tabstops < 2)
- printbuf_tabstop_push(out, 8);
- if (out->nr_tabstops < 3)
- printbuf_tabstop_push(out, 16);
-
- guard(mutex)(&c->fsck_error_counts_lock);
-
- bch_sb_errors_cpu *e = &c->fsck_error_counts;
- darray_for_each(*e, i) {
- bch2_sb_error_id_to_text(out, i->id);
- prt_tab(out);
- prt_u64(out, i->nr);
- prt_tab(out);
- bch2_prt_datetime(out, i->last_error_time);
- prt_newline(out);
- }
-}
-
-void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
-{
- bch_sb_errors_cpu *e = &c->fsck_error_counts;
- struct bch_sb_error_entry_cpu n = {
- .id = err,
- .nr = 1,
- .last_error_time = ktime_get_real_seconds()
- };
- unsigned i;
-
- mutex_lock(&c->fsck_error_counts_lock);
- for (i = 0; i < e->nr; i++) {
- if (err == e->data[i].id) {
- e->data[i].nr++;
- e->data[i].last_error_time = n.last_error_time;
- goto out;
- }
- if (err < e->data[i].id)
- break;
- }
-
- if (darray_make_room(e, 1))
- goto out;
-
- darray_insert_item(e, i, n);
-out:
- mutex_unlock(&c->fsck_error_counts_lock);
-}
-
-void bch2_sb_errors_from_cpu(struct bch_fs *c)
-{
- bch_sb_errors_cpu *src = &c->fsck_error_counts;
- struct bch_sb_field_errors *dst;
- unsigned i;
-
- mutex_lock(&c->fsck_error_counts_lock);
-
- dst = bch2_sb_field_resize(&c->disk_sb, errors,
- bch2_sb_field_errors_u64s(src->nr));
-
- if (!dst)
- goto err;
-
- for (i = 0; i < src->nr; i++) {
- SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
- SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
- dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
- }
-
-err:
- mutex_unlock(&c->fsck_error_counts_lock);
-}
-
-static int bch2_sb_errors_to_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
- bch_sb_errors_cpu *dst = &c->fsck_error_counts;
- unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
- int ret;
-
- if (!nr)
- return 0;
-
- mutex_lock(&c->fsck_error_counts_lock);
- ret = darray_make_room(dst, nr);
- if (ret)
- goto err;
-
- dst->nr = nr;
-
- for (i = 0; i < nr; i++) {
- dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
- dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
- dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
- }
-err:
- mutex_unlock(&c->fsck_error_counts_lock);
-
- return ret;
-}
-
-void bch2_fs_sb_errors_exit(struct bch_fs *c)
-{
- darray_exit(&c->fsck_error_counts);
-}
-
-void bch2_fs_sb_errors_init_early(struct bch_fs *c)
-{
- mutex_init(&c->fsck_error_counts_lock);
- darray_init(&c->fsck_error_counts);
-}
-
-int bch2_fs_sb_errors_init(struct bch_fs *c)
-{
- return bch2_sb_errors_to_cpu(c);
-}
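
For context on the file above: bch2_sb_error_count() keeps the in-memory fsck error counts as a darray sorted by error id, bumping an existing entry or inserting a new one at the right position; that ordering is what bch2_sb_errors_validate() later insists on for the on-disk section. A toy userspace equivalent, using a fixed-size array instead of a darray and no locking, might look like this (names here are invented):

#include <stdio.h>
#include <string.h>
#include <time.h>

struct err_count { unsigned id; unsigned long long nr; long long last_time; };

static struct err_count counts[64];
static unsigned nr_counts;

static void count_error(unsigned id)
{
	unsigned i;

	for (i = 0; i < nr_counts; i++) {
		if (counts[i].id == id) {	/* already present: just bump it */
			counts[i].nr++;
			counts[i].last_time = time(NULL);
			return;
		}
		if (id < counts[i].id)		/* keep the array sorted by id */
			break;
	}

	if (nr_counts == 64)			/* ~ darray_make_room() failing */
		return;

	memmove(&counts[i + 1], &counts[i], (nr_counts - i) * sizeof(counts[0]));
	counts[i] = (struct err_count) { .id = id, .nr = 1, .last_time = time(NULL) };
	nr_counts++;
}

int main(void)
{
	count_error(42);
	count_error(7);
	count_error(42);

	for (unsigned i = 0; i < nr_counts; i++)
		printf("error %u seen %llu times\n", counts[i].id, counts[i].nr);
	return 0;
}
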
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
deleted file mode 100644
index e86267264692..000000000000
--- a/fs/bcachefs/sb-errors.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_H
-#define _BCACHEFS_SB_ERRORS_H
-
-#include "sb-errors_types.h"
-
-extern const char * const bch2_sb_error_strs[];
-
-void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
-void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *);
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
-
-void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
-
-void bch2_sb_errors_from_cpu(struct bch_fs *);
-
-void bch2_fs_sb_errors_exit(struct bch_fs *);
-void bch2_fs_sb_errors_init_early(struct bch_fs *);
-int bch2_fs_sb_errors_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_ERRORS_H */
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
deleted file mode 100644
index d154b7651d28..000000000000
--- a/fs/bcachefs/sb-errors_format.h
+++ /dev/null
@@ -1,353 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H
-#define _BCACHEFS_SB_ERRORS_FORMAT_H
-
-enum bch_fsck_flags {
- FSCK_CAN_FIX = BIT(0),
- FSCK_CAN_IGNORE = BIT(1),
- FSCK_AUTOFIX = BIT(2),
- FSCK_ERR_NO_LOG = BIT(3),
-};
-
-#define BCH_SB_ERRS() \
- x(clean_but_journal_not_empty, 0, 0) \
- x(dirty_but_no_journal_entries, 1, 0) \
- x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \
- x(sb_clean_journal_seq_mismatch, 3, 0) \
- x(sb_clean_btree_root_mismatch, 4, 0) \
- x(sb_clean_missing, 5, 0) \
- x(jset_unsupported_version, 6, 0) \
- x(jset_unknown_csum, 7, 0) \
- x(jset_last_seq_newer_than_seq, 8, 0) \
- x(jset_past_bucket_end, 9, 0) \
- x(jset_seq_blacklisted, 10, 0) \
- x(journal_entries_missing, 11, 0) \
- x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
- x(journal_entry_past_jset_end, 13, 0) \
- x(journal_entry_replicas_data_mismatch, 14, 0) \
- x(journal_entry_bkey_u64s_0, 15, 0) \
- x(journal_entry_bkey_past_end, 16, 0) \
- x(journal_entry_bkey_bad_format, 17, 0) \
- x(journal_entry_bkey_invalid, 18, 0) \
- x(journal_entry_btree_root_bad_size, 19, 0) \
- x(journal_entry_blacklist_bad_size, 20, 0) \
- x(journal_entry_blacklist_v2_bad_size, 21, 0) \
- x(journal_entry_blacklist_v2_start_past_end, 22, 0) \
- x(journal_entry_usage_bad_size, 23, 0) \
- x(journal_entry_data_usage_bad_size, 24, 0) \
- x(journal_entry_clock_bad_size, 25, 0) \
- x(journal_entry_clock_bad_rw, 26, 0) \
- x(journal_entry_dev_usage_bad_size, 27, 0) \
- x(journal_entry_dev_usage_bad_dev, 28, 0) \
- x(journal_entry_dev_usage_bad_pad, 29, 0) \
- x(btree_node_unreadable, 30, 0) \
- x(btree_node_fault_injected, 31, 0) \
- x(btree_node_bad_magic, 32, 0) \
- x(btree_node_bad_seq, 33, 0) \
- x(btree_node_unsupported_version, 34, 0) \
- x(btree_node_bset_older_than_sb_min, 35, 0) \
- x(btree_node_bset_newer_than_sb, 36, 0) \
- x(btree_node_data_missing, 37, FSCK_AUTOFIX) \
- x(btree_node_bset_after_end, 38, 0) \
- x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
- x(btree_node_replicas_data_mismatch, 40, 0) \
- x(bset_unknown_csum, 41, 0) \
- x(bset_bad_csum, 42, 0) \
- x(bset_past_end_of_btree_node, 43, 0) \
- x(bset_wrong_sector_offset, 44, 0) \
- x(bset_empty, 45, 0) \
- x(bset_bad_seq, 46, 0) \
- x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \
- x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \
- x(btree_node_bad_btree, 49, 0) \
- x(btree_node_bad_level, 50, 0) \
- x(btree_node_bad_min_key, 51, 0) \
- x(btree_node_bad_max_key, 52, 0) \
- x(btree_node_bad_format, 53, 0) \
- x(btree_node_bkey_past_bset_end, 54, 0) \
- x(btree_node_bkey_bad_format, 55, 0) \
- x(btree_node_bad_bkey, 56, 0) \
- x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \
- x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \
- x(btree_root_read_error, 59, FSCK_AUTOFIX) \
- x(btree_root_bad_min_key, 60, 0) \
- x(btree_root_bad_max_key, 61, 0) \
- x(btree_node_read_error, 62, FSCK_AUTOFIX) \
- x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \
- x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \
- x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \
- x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \
- x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \
- x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \
- x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \
- x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \
- x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \
- x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \
- x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \
- x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \
- x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \
- x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \
- x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \
- x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \
- x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \
- x(bkey_version_in_future, 80, 0) \
- x(bkey_u64s_too_small, 81, 0) \
- x(bkey_invalid_type_for_btree, 82, 0) \
- x(bkey_extent_size_zero, 83, 0) \
- x(bkey_extent_size_greater_than_offset, 84, 0) \
- x(bkey_size_nonzero, 85, 0) \
- x(bkey_snapshot_nonzero, 86, 0) \
- x(bkey_snapshot_zero, 87, 0) \
- x(bkey_at_pos_max, 88, 0) \
- x(bkey_before_start_of_btree_node, 89, 0) \
- x(bkey_after_end_of_btree_node, 90, 0) \
- x(bkey_val_size_nonzero, 91, 0) \
- x(bkey_val_size_too_small, 92, 0) \
- x(alloc_v1_val_size_bad, 93, 0) \
- x(alloc_v2_unpack_error, 94, 0) \
- x(alloc_v3_unpack_error, 95, 0) \
- x(alloc_v4_val_size_bad, 96, 0) \
- x(alloc_v4_backpointers_start_bad, 97, 0) \
- x(alloc_key_data_type_bad, 98, 0) \
- x(alloc_key_empty_but_have_data, 99, 0) \
- x(alloc_key_dirty_sectors_0, 100, 0) \
- x(alloc_key_data_type_inconsistency, 101, 0) \
- x(alloc_key_to_missing_dev_bucket, 102, 0) \
- x(alloc_key_cached_inconsistency, 103, 0) \
- x(alloc_key_cached_but_read_time_zero, 104, FSCK_AUTOFIX) \
- x(alloc_key_to_missing_lru_entry, 105, FSCK_AUTOFIX) \
- x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \
- x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \
- x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \
- x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \
- x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \
- x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \
- x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \
- x(bucket_sector_count_overflow, 112, 0) \
- x(bucket_metadata_type_mismatch, 113, 0) \
- x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \
- x(freespace_key_wrong, 115, FSCK_AUTOFIX) \
- x(freespace_hole_missing, 116, FSCK_AUTOFIX) \
- x(bucket_gens_val_size_bad, 117, 0) \
- x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \
- x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \
- x(bucket_gens_to_invalid_dev, 120, FSCK_AUTOFIX) \
- x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \
- x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \
- x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
- x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \
- x(discarding_bucket_not_in_need_discard_btree, 291, 0) \
- x(backpointer_bucket_offset_wrong, 125, 0) \
- x(backpointer_level_bad, 294, 0) \
- x(backpointer_dev_bad, 297, 0) \
- x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \
- x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \
- x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \
- x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \
- x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \
- x(lru_entry_bad, 131, FSCK_AUTOFIX) \
- x(btree_ptr_val_too_big, 132, 0) \
- x(btree_ptr_v2_val_too_big, 133, 0) \
- x(btree_ptr_has_non_ptr, 134, 0) \
- x(extent_ptrs_invalid_entry, 135, 0) \
- x(extent_ptrs_no_ptrs, 136, 0) \
- x(extent_ptrs_too_many_ptrs, 137, 0) \
- x(extent_ptrs_redundant_crc, 138, 0) \
- x(extent_ptrs_redundant_stripe, 139, 0) \
- x(extent_ptrs_unwritten, 140, 0) \
- x(extent_ptrs_written_and_unwritten, 141, 0) \
- x(ptr_to_invalid_device, 142, 0) \
- x(ptr_to_duplicate_device, 143, 0) \
- x(ptr_after_last_bucket, 144, 0) \
- x(ptr_before_first_bucket, 145, 0) \
- x(ptr_spans_multiple_buckets, 146, 0) \
- x(ptr_to_missing_backpointer, 147, FSCK_AUTOFIX) \
- x(ptr_to_missing_alloc_key, 148, FSCK_AUTOFIX) \
- x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \
- x(ptr_to_missing_stripe, 150, 0) \
- x(ptr_to_incorrect_stripe, 151, 0) \
- x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \
- x(ptr_too_stale, 153, 0) \
- x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \
- x(ptr_bucket_data_type_mismatch, 155, 0) \
- x(ptr_cached_and_erasure_coded, 156, 0) \
- x(ptr_crc_uncompressed_size_too_small, 157, 0) \
- x(ptr_crc_uncompressed_size_too_big, 161, 0) \
- x(ptr_crc_uncompressed_size_mismatch, 300, 0) \
- x(ptr_crc_csum_type_unknown, 158, 0) \
- x(ptr_crc_compression_type_unknown, 159, 0) \
- x(ptr_crc_redundant, 160, 0) \
- x(ptr_crc_nonce_mismatch, 162, 0) \
- x(ptr_stripe_redundant, 163, 0) \
- x(extent_flags_not_at_start, 306, 0) \
- x(reservation_key_nr_replicas_invalid, 164, 0) \
- x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
- x(reflink_v_pos_bad, 292, 0) \
- x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \
- x(reflink_refcount_underflow, 293, 0) \
- x(stripe_pos_bad, 167, 0) \
- x(stripe_val_size_bad, 168, 0) \
- x(stripe_csum_granularity_bad, 290, 0) \
- x(stripe_sector_count_wrong, 169, 0) \
- x(snapshot_tree_pos_bad, 170, 0) \
- x(snapshot_tree_to_missing_snapshot, 171, 0) \
- x(snapshot_tree_to_missing_subvol, 172, 0) \
- x(snapshot_tree_to_wrong_subvol, 173, 0) \
- x(snapshot_tree_to_snapshot_subvol, 174, 0) \
- x(snapshot_pos_bad, 175, 0) \
- x(snapshot_parent_bad, 176, 0) \
- x(snapshot_children_not_normalized, 177, 0) \
- x(snapshot_child_duplicate, 178, 0) \
- x(snapshot_child_bad, 179, 0) \
- x(snapshot_skiplist_not_normalized, 180, 0) \
- x(snapshot_skiplist_bad, 181, 0) \
- x(snapshot_should_not_have_subvol, 182, 0) \
- x(snapshot_to_bad_snapshot_tree, 183, FSCK_AUTOFIX) \
- x(snapshot_bad_depth, 184, 0) \
- x(snapshot_bad_skiplist, 185, 0) \
- x(subvol_pos_bad, 186, 0) \
- x(subvol_not_master_and_not_snapshot, 187, FSCK_AUTOFIX) \
- x(subvol_to_missing_root, 188, 0) \
- x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \
- x(bkey_in_missing_snapshot, 190, 0) \
- x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \
- x(inode_pos_inode_nonzero, 191, 0) \
- x(inode_pos_blockdev_range, 192, 0) \
- x(inode_alloc_cursor_inode_bad, 301, 0) \
- x(inode_unpack_error, 193, 0) \
- x(inode_str_hash_invalid, 194, 0) \
- x(inode_v3_fields_start_bad, 195, 0) \
- x(inode_snapshot_mismatch, 196, 0) \
- x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \
- x(inode_unlinked_but_clean, 197, 0) \
- x(inode_unlinked_but_nlink_nonzero, 198, 0) \
- x(inode_unlinked_and_not_open, 281, 0) \
- x(inode_unlinked_but_has_dirent, 285, 0) \
- x(inode_checksum_type_invalid, 199, 0) \
- x(inode_compression_type_invalid, 200, 0) \
- x(inode_subvol_root_but_not_dir, 201, 0) \
- x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \
- x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \
- x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \
- x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \
- x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
- x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \
- x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \
- x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \
- x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
- x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
- x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
- x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \
- x(inode_unreachable, 210, FSCK_AUTOFIX) \
- x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
- x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \
- x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \
- x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \
- x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \
- x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \
- x(vfs_bad_inode_rm, 320, 0) \
- x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
- x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
- x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
- x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
- x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
- x(extent_overlapping, 215, 0) \
- x(key_in_missing_inode, 216, FSCK_AUTOFIX) \
- x(key_in_wrong_inode_type, 217, 0) \
- x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \
- x(dirent_empty_name, 219, 0) \
- x(dirent_val_too_big, 220, 0) \
- x(dirent_name_too_long, 221, 0) \
- x(dirent_name_embedded_nul, 222, 0) \
- x(dirent_name_dot_or_dotdot, 223, 0) \
- x(dirent_name_has_slash, 224, 0) \
- x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \
- x(inode_bi_parent_wrong, 226, 0) \
- x(dirent_in_missing_dir_inode, 227, 0) \
- x(dirent_in_non_dir_inode, 228, 0) \
- x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \
- x(dirent_to_overwritten_inode, 302, 0) \
- x(dirent_to_missing_subvol, 230, 0) \
- x(dirent_to_itself, 231, 0) \
- x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \
- x(quota_type_invalid, 232, 0) \
- x(xattr_val_size_too_small, 233, 0) \
- x(xattr_val_size_too_big, 234, 0) \
- x(xattr_invalid_type, 235, 0) \
- x(xattr_name_invalid_chars, 236, 0) \
- x(xattr_in_missing_inode, 237, 0) \
- x(root_subvol_missing, 238, 0) \
- x(root_dir_missing, 239, 0) \
- x(root_inode_not_dir, 240, 0) \
- x(dir_loop, 241, 0) \
- x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \
- x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \
- x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
- x(reflink_p_front_pad_bad, 245, 0) \
- x(journal_entry_dup_same_device, 246, 0) \
- x(inode_bi_subvol_missing, 247, 0) \
- x(inode_bi_subvol_wrong, 248, 0) \
- x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \
- x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \
- x(inode_bi_parent_nonzero, 251, 0) \
- x(dirent_to_missing_parent_subvol, 252, 0) \
- x(dirent_not_visible_in_parent_subvol, 253, 0) \
- x(subvol_fs_path_parent_wrong, 254, 0) \
- x(subvol_root_fs_path_parent_nonzero, 255, 0) \
- x(subvol_children_not_set, 256, 0) \
- x(subvol_children_bad, 257, 0) \
- x(subvol_loop, 258, 0) \
- x(subvol_unreachable, 259, FSCK_AUTOFIX) \
- x(btree_node_bkey_bad_u64s, 260, 0) \
- x(btree_node_topology_empty_interior_node, 261, 0) \
- x(btree_ptr_v2_min_key_bad, 262, 0) \
- x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
- x(snapshot_node_missing, 264, FSCK_AUTOFIX) \
- x(dup_backpointer_to_bad_csum_extent, 265, 0) \
- x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
- x(sb_clean_entry_overrun, 267, 0) \
- x(btree_ptr_v2_written_0, 268, 0) \
- x(subvol_snapshot_bad, 269, 0) \
- x(subvol_inode_bad, 270, 0) \
- x(subvol_missing, 308, FSCK_AUTOFIX) \
- x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
- x(accounting_mismatch, 272, FSCK_AUTOFIX) \
- x(accounting_replicas_not_marked, 273, 0) \
- x(accounting_to_invalid_device, 289, 0) \
- x(invalid_btree_id, 274, FSCK_AUTOFIX) \
- x(alloc_key_io_time_bad, 275, 0) \
- x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
- x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
- x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
- x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
- x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
- x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
- x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \
- x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
- x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
- x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
- x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
- x(dirent_cf_name_too_big, 304, 0) \
- x(dirent_stray_data_after_cf_name, 305, 0) \
- x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
- x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(MAX, 321, 0)
-
-enum bch_sb_error_id {
-#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
- BCH_SB_ERRS()
-#undef x
-};
-
-struct bch_sb_field_errors {
- struct bch_sb_field field;
- struct bch_sb_field_error_entry {
- __le64 v;
- __le64 last_error_time;
- } entries[];
-};
-
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
-LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
-
-#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */
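
The two LE64_BITMASK() lines at the end of the header pack both fields of an on-disk error entry into the single __le64 v: bits 0-15 carry the error id, bits 16-63 the occurrence count. Ignoring the little-endian byte swapping the real accessors do, they behave roughly like this sketch (helper names invented here):

#include <stdint.h>
#include <stdio.h>

/* bits 0..15 of v: error id; bits 16..63: occurrence count */
static uint64_t entry_id(uint64_t v)	{ return v & 0xffff; }
static uint64_t entry_nr(uint64_t v)	{ return v >> 16; }

static uint64_t entry_pack(uint64_t id, uint64_t nr)
{
	return (id & 0xffff) | (nr << 16);
}

int main(void)
{
	uint64_t v = entry_pack(108, 3);	/* hypothetical: error id 108, seen 3 times */

	printf("id %llu, nr %llu\n",
	       (unsigned long long) entry_id(v),
	       (unsigned long long) entry_nr(v));
	return 0;
}
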
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
deleted file mode 100644
index 40325239c3b0..000000000000
--- a/fs/bcachefs/sb-errors_types.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
-#define _BCACHEFS_SB_ERRORS_TYPES_H
-
-#include "darray.h"
-
-struct bch_sb_error_entry_cpu {
- u64 id:16,
- nr:48;
- u64 last_error_time;
-};
-
-typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
-
-#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
deleted file mode 100644
index 6245e342a8a8..000000000000
--- a/fs/bcachefs/sb-members.c
+++ /dev/null
@@ -1,606 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "disk_groups.h"
-#include "error.h"
-#include "opts.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-members.h"
-#include "super-io.h"
-
-int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
-{
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev);
- bch2_bkey_val_to_text(&buf, c, k);
-
- bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf);
-
- int ret = bch2_run_explicit_recovery_pass(c, &buf,
- BCH_RECOVERY_PASS_check_allocations, 0);
-
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev)
-{
- if (dev != BCH_SB_MEMBER_INVALID)
- bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
-}
-
-void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket)
-{
- bch2_fs_inconsistent(ca->fs,
- "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)",
- bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets);
-}
-
-#define x(t, n, ...) [n] = #t,
-static const char * const bch2_iops_measurements[] = {
- BCH_IOPS_MEASUREMENTS()
- NULL
-};
-
-char * const bch2_member_error_strs[] = {
- BCH_MEMBER_ERROR_TYPES()
- NULL
-};
-#undef x
-
-/* Code for bch_sb_field_members_v1: */
-
-struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
-{
- return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
-}
-
-static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
-{
- struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i);
- memset(&ret, 0, sizeof(ret));
- memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
- return ret;
-}
-
-static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i)
-{
- return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES);
-}
-
-static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i)
-{
- struct bch_member ret, *p = members_v1_get_mut(mi, i);
- memset(&ret, 0, sizeof(ret));
- memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret)));
- return ret;
-}
-
-struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
-{
- struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2);
- if (mi2)
- return members_v2_get(mi2, i);
- struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1);
- return members_v1_get(mi1, i);
-}
-
-static int sb_members_v2_resize_entries(struct bch_fs *c)
-{
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-
- if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) {
- unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) *
- c->disk_sb.sb->nr_devices), 8);
-
- mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
- if (!mi)
- return bch_err_throw(c, ENOSPC_sb_members_v2);
-
- for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
- void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
- memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
- memset(dst + le16_to_cpu(mi->member_bytes),
- 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
- }
- mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
- }
- return 0;
-}
-
-int bch2_sb_members_v2_init(struct bch_fs *c)
-{
- struct bch_sb_field_members_v1 *mi1;
- struct bch_sb_field_members_v2 *mi2;
-
- if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) {
- mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2,
- DIV_ROUND_UP(sizeof(*mi2) +
- sizeof(struct bch_member) * c->sb.nr_devices,
- sizeof(u64)));
- mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1);
- memcpy(&mi2->_members[0], &mi1->_members[0],
- BCH_MEMBER_V1_BYTES * c->sb.nr_devices);
- memset(&mi2->pad[0], 0, sizeof(mi2->pad));
- mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES);
- }
-
- return sb_members_v2_resize_entries(c);
-}
-
-int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
-{
- struct bch_sb_field_members_v1 *mi1;
- struct bch_sb_field_members_v2 *mi2;
-
- if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) {
- bch2_sb_field_resize(disk_sb, members_v1, 0);
- return 0;
- }
-
- mi1 = bch2_sb_field_resize(disk_sb, members_v1,
- DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
- disk_sb->sb->nr_devices, sizeof(u64)));
- if (!mi1)
- return -BCH_ERR_ENOSPC_sb_members;
-
- mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
-
- for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
- memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
-
- return 0;
-}
-
-static int validate_member(struct printbuf *err,
- struct bch_member m,
- struct bch_sb *sb,
- int i)
-{
- if (le64_to_cpu(m.nbuckets) > BCH_MEMBER_NBUCKETS_MAX) {
- prt_printf(err, "device %u: too many buckets (got %llu, max %u)",
- i, le64_to_cpu(m.nbuckets), BCH_MEMBER_NBUCKETS_MAX);
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le64_to_cpu(m.nbuckets) -
- le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) {
- prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
- i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS);
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le16_to_cpu(m.bucket_size) <
- le16_to_cpu(sb->block_size)) {
- prt_printf(err, "device %u: bucket size %u smaller than block size %u",
- i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size));
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le16_to_cpu(m.bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(sb)) {
- prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
- i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) {
- prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) &&
- sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) {
- prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i);
- return -BCH_ERR_invalid_sb_members;
- }
-
- return 0;
-}
-
-static void member_to_text(struct printbuf *out,
- struct bch_member m,
- struct bch_sb_field_disk_groups *gi,
- struct bch_sb *sb,
- int i)
-{
- unsigned data_have = bch2_sb_dev_has_data(sb, i);
- u64 bucket_size = le16_to_cpu(m.bucket_size);
- u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
-
- if (!bch2_member_alive(&m))
- return;
-
- prt_printf(out, "Device:\t%u\n", i);
-
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "Label:\t");
- if (BCH_MEMBER_GROUP(&m))
- bch2_disk_path_to_text_sb(out, sb,
- BCH_MEMBER_GROUP(&m) - 1);
- else
- prt_printf(out, "(none)");
- prt_newline(out);
-
- prt_printf(out, "UUID:\t");
- pr_uuid(out, m.uuid.b);
- prt_newline(out);
-
- prt_printf(out, "Size:\t");
- prt_units_u64(out, device_size << 9);
- prt_newline(out);
-
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
- prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i]));
-
- for (unsigned i = 0; i < BCH_IOPS_NR; i++)
- prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i]));
-
- prt_printf(out, "Bucket size:\t");
- prt_units_u64(out, bucket_size << 9);
- prt_newline(out);
-
- prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket));
- prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets));
-
- prt_printf(out, "Last mount:\t");
- if (m.last_mount)
- bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
- else
- prt_printf(out, "(never)");
- prt_newline(out);
-
- prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq));
-
- prt_printf(out, "State:\t%s\n",
- BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
- ? bch2_member_states[BCH_MEMBER_STATE(&m)]
- : "unknown");
-
- prt_printf(out, "Data allowed:\t");
- if (BCH_MEMBER_DATA_ALLOWED(&m))
- prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
- else
- prt_printf(out, "(none)");
- prt_newline(out);
-
- prt_printf(out, "Has data:\t");
- if (data_have)
- prt_bitflags(out, __bch2_data_types, data_have);
- else
- prt_printf(out, "(none)");
- prt_newline(out);
-
- prt_printf(out, "Btree allocated bitmap blocksize:\t");
- if (m.btree_bitmap_shift < 64)
- prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
- else
- prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
- prt_newline(out);
-
- prt_printf(out, "Btree allocated bitmap:\t");
- bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64);
- prt_newline(out);
-
- prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
-
- prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
- prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
- prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m));
-
- printbuf_indent_sub(out, 2);
-}
-
-static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
- unsigned i;
-
- if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) {
- prt_printf(err, "too many devices for section size");
- return -BCH_ERR_invalid_sb_members;
- }
-
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member m = members_v1_get(mi, i);
-
- int ret = validate_member(err, m, sb, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
- struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
-
- if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
- prt_printf(out, "field ends before start of entries");
- return;
- }
-
- unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]);
- if (nr != sb->nr_devices)
- prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
-
- for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
- member_to_text(out, members_v1_get(mi, i), gi, sb, i);
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
- .validate = bch2_sb_members_v1_validate,
- .to_text = bch2_sb_members_v1_to_text,
-};
-
-static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
- struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
-
- if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
- prt_printf(out, "field ends before start of entries");
- return;
- }
-
- if (!le16_to_cpu(mi->member_bytes)) {
- prt_printf(out, "member_bytes 0");
- return;
- }
-
- unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes);
- if (nr != sb->nr_devices)
- prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
-
- /*
- * We call to_text() on superblock sections that haven't passed
- * validate, so we can't trust sb->nr_devices.
- */
-
- for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
- member_to_text(out, members_v2_get(mi, i), gi, sb, i);
-}
-
-static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
- size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
- (void *) mi;
-
- if (mi_bytes > vstruct_bytes(&mi->field)) {
- prt_printf(err, "section too small (%zu > %zu)",
- mi_bytes, vstruct_bytes(&mi->field));
- return -BCH_ERR_invalid_sb_members;
- }
-
- for (unsigned i = 0; i < sb->nr_devices; i++) {
- int ret = validate_member(err, members_v2_get(mi, i), sb, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
- .validate = bch2_sb_members_v2_validate,
- .to_text = bch2_sb_members_v2_to_text,
-};
-
-void bch2_sb_members_from_cpu(struct bch_fs *c)
-{
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
-
- guard(rcu)();
- for_each_member_device_rcu(c, ca, NULL) {
- struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
-
- for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
- m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
- }
-}
-
-void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_member m;
-
- mutex_lock(&ca->fs->sb_lock);
- m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
- mutex_unlock(&ca->fs->sb_lock);
-
- printbuf_tabstop_push(out, 12);
-
- prt_str(out, "IO errors since filesystem creation");
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
- prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i]));
- printbuf_indent_sub(out, 2);
-
- prt_str(out, "IO errors since ");
- bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC);
- prt_str(out, " ago");
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
- for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++)
- prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i],
- atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
- printbuf_indent_sub(out, 2);
-}
-
-void bch2_dev_errors_reset(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_member *m;
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
- m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
- m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-}
-
-/*
- * Per member "range has btree nodes" bitmap:
- *
- * This is so that if we ever have to run the btree node scan to repair we don't
- * have to scan full devices:
- */
-
-bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
-{
- guard(rcu)();
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (ca &&
- !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c)))
- return false;
- }
- return true;
-}
-
-static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
- u64 start, unsigned sectors)
-{
- struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
- u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
-
- u64 end = start + sectors;
-
- int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
- if (resize > 0) {
- u64 new_bitmap = 0;
-
- for (unsigned i = 0; i < 64; i++)
- if (bitmap & BIT_ULL(i))
- new_bitmap |= BIT_ULL(i >> resize);
- bitmap = new_bitmap;
- m->btree_bitmap_shift += resize;
- }
-
- BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX);
- BUG_ON(end > 64ULL << m->btree_bitmap_shift);
-
- for (unsigned bit = start >> m->btree_bitmap_shift;
- (u64) bit << m->btree_bitmap_shift < end;
- bit++)
- bitmap |= BIT_ULL(bit);
-
- m->btree_allocated_bitmap = cpu_to_le64(bitmap);
-}
-
-void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
-{
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- if (!bch2_member_exists(c->disk_sb.sb, ptr->dev))
- continue;
-
- __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
- }
-}
-
-unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
-{
- unsigned nr = 0;
-
- for (unsigned i = 0; i < sb->nr_devices; i++)
- nr += bch2_member_exists((struct bch_sb *) sb, i);
- return nr;
-}
-
-int bch2_sb_member_alloc(struct bch_fs *c)
-{
- unsigned dev_idx = c->sb.nr_devices;
- struct bch_sb_field_members_v2 *mi;
- unsigned nr_devices;
- unsigned u64s;
- int best = -1;
- u64 best_last_mount = 0;
- unsigned nr_deleted = 0;
-
- if (dev_idx < BCH_SB_MEMBERS_MAX)
- goto have_slot;
-
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
- /* eventually BCH_SB_MEMBERS_MAX will be raised */
- if (dev_idx == BCH_SB_MEMBER_INVALID)
- continue;
-
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
-
- nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID);
-
- if (!bch2_is_zero(&m.uuid, sizeof(m.uuid)))
- continue;
-
- u64 last_mount = le64_to_cpu(m.last_mount);
- if (best < 0 || last_mount < best_last_mount) {
- best = dev_idx;
- best_last_mount = last_mount;
- }
- }
- if (best >= 0) {
- dev_idx = best;
- goto have_slot;
- }
-
- if (nr_deleted)
- bch_err(c, "unable to allocate new member, but have %u deleted: run fsck",
- nr_deleted);
-
- return -BCH_ERR_ENOSPC_sb_members;
-have_slot:
- nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
-
- mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
- le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
-
- mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
- if (!mi)
- return -BCH_ERR_ENOSPC_sb_members;
-
- c->disk_sb.sb->nr_devices = nr_devices;
- return dev_idx;
-}
-
-void bch2_sb_members_clean_deleted(struct bch_fs *c)
-{
- mutex_lock(&c->sb_lock);
- bool write_sb = false;
-
- for (unsigned i = 0; i < c->sb.nr_devices; i++) {
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i);
-
- if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) {
- memset(&m->uuid, 0, sizeof(m->uuid));
- write_sb = true;
- }
- }
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
deleted file mode 100644
index 8d8a8a857648..000000000000
--- a/fs/bcachefs/sb-members.h
+++ /dev/null
@@ -1,377 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_H
-#define _BCACHEFS_SB_MEMBERS_H
-
-#include "darray.h"
-#include "bkey_types.h"
-#include "enumerated_ref.h"
-
-extern char * const bch2_member_error_strs[];
-
-static inline struct bch_member *
-__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i)
-{
- return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
-}
-
-int bch2_sb_members_v2_init(struct bch_fs *c);
-int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
-struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
-struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
-
-static inline bool bch2_dev_is_online(struct bch_dev *ca)
-{
- return !enumerated_ref_is_zero(&ca->io_ref[READ]);
-}
-
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
-
-static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
-{
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
- return ca && bch2_dev_is_online(ca);
-}
-
-static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
-{
- return bch2_dev_is_online(ca) &&
- ca->mi.state != BCH_MEMBER_STATE_failed;
-}
-
-static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
-{
- return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
-}
-
-static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
- unsigned dev)
-{
- darray_for_each(devs, i)
- if (*i == dev)
- return true;
- return false;
-}
-
-static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
- unsigned dev)
-{
- darray_for_each(*devs, i)
- if (*i == dev) {
- darray_remove_item(devs, i);
- return;
- }
-}
-
-static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
- unsigned dev)
-{
- if (!bch2_dev_list_has_dev(*devs, dev)) {
- BUG_ON(devs->nr >= ARRAY_SIZE(devs->data));
- devs->data[devs->nr++] = dev;
- }
-}
-
-static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
-{
- return (struct bch_devs_list) { .nr = 1, .data[0] = dev };
-}
-
-static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx,
- const struct bch_devs_mask *mask)
-{
- struct bch_dev *ca = NULL;
-
- while ((idx = mask
- ? find_next_bit(mask->d, c->sb.nr_devices, idx)
- : idx) < c->sb.nr_devices &&
- !(ca = rcu_dereference_check(c->devs[idx],
- lockdep_is_held(&c->state_lock))))
- idx++;
-
- return ca;
-}
-
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca,
- const struct bch_devs_mask *mask)
-{
- return __bch2_next_dev_idx(c, ca ? ca->dev_idx + 1 : 0, mask);
-}
-
-#define for_each_member_device_rcu(_c, _ca, _mask) \
- for (struct bch_dev *_ca = NULL; \
- (_ca = __bch2_next_dev((_c), _ca, (_mask)));)
-
-#define for_each_online_member_rcu(_c, _ca) \
- for_each_member_device_rcu(_c, _ca, &(_c)->online_devs)
-
-#define for_each_rw_member_rcu(_c, _ca) \
- for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free])
-
-static inline void bch2_dev_get(struct bch_dev *ca)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- BUG_ON(atomic_long_inc_return(&ca->ref) <= 1L);
-#else
- percpu_ref_get(&ca->ref);
-#endif
-}
-
-static inline void __bch2_dev_put(struct bch_dev *ca)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- long r = atomic_long_dec_return(&ca->ref);
- if (r < (long) !ca->dying)
- panic("bch_dev->ref underflow, last put: %pS\n", (void *) ca->last_put);
- ca->last_put = _THIS_IP_;
- if (!r)
- complete(&ca->ref_completion);
-#else
- percpu_ref_put(&ca->ref);
-#endif
-}
-
-static inline void bch2_dev_put(struct bch_dev *ca)
-{
- if (ca)
- __bch2_dev_put(ca);
-}
-
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
-{
- guard(rcu)();
- bch2_dev_put(ca);
- if ((ca = __bch2_next_dev(c, ca, NULL)))
- bch2_dev_get(ca);
- return ca;
-}
-
-/*
- * If you break early, you must drop your ref on the current device
- */
-#define __for_each_member_device(_c, _ca) \
- for (; (_ca = bch2_get_next_dev(_c, _ca));)
-
-#define for_each_member_device(_c, _ca) \
- for (struct bch_dev *_ca = NULL; \
- (_ca = bch2_get_next_dev(_c, _ca));)
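A usage sketch for the iterator above; device_is_interesting() is hypothetical, and as the preceding comment notes, breaking out early leaves the caller holding the device reference:

/* Sketch only: returns a referenced device, or NULL. */
static struct bch_dev *find_interesting_dev(struct bch_fs *c)
{
        for_each_member_device(c, ca)
                if (device_is_interesting(ca))
                        return ca;      /* ref held; caller must bch2_dev_put(ca) */

        return NULL;
}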
-
-static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
- struct bch_dev *ca,
- unsigned state_mask,
- int rw, unsigned ref_idx)
-{
- guard(rcu)();
- if (ca)
- enumerated_ref_put(&ca->io_ref[rw], ref_idx);
-
- while ((ca = __bch2_next_dev(c, ca, NULL)) &&
- (!((1 << ca->mi.state) & state_mask) ||
- !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)))
- ;
-
- return ca;
-}
-
-#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \
- for (struct bch_dev *_ca = NULL; \
- (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));)
-
-#define for_each_online_member(c, ca, ref_idx) \
- __for_each_online_member(c, ca, ~0, READ, ref_idx)
-
-#define for_each_rw_member(c, ca, ref_idx) \
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx)
-
-#define for_each_readable_member(c, ca, ref_idx) \
- __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx)
-
-static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
-{
- return dev < c->sb.nr_devices && c->devs[dev];
-}
-
-static inline bool bucket_valid(const struct bch_dev *ca, u64 b)
-{
- return b - ca->mi.first_bucket < ca->mi.nbuckets_minus_first;
-}
-
-static inline struct bch_dev *bch2_dev_have_ref(const struct bch_fs *c, unsigned dev)
-{
- EBUG_ON(!bch2_dev_exists(c, dev));
-
- return rcu_dereference_check(c->devs[dev], 1);
-}
-
-static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
-{
- EBUG_ON(!bch2_dev_exists(c, dev));
-
- return rcu_dereference_protected(c->devs[dev],
- lockdep_is_held(&c->sb_lock) ||
- lockdep_is_held(&c->state_lock));
-}
-
-static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
-{
- return c && dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[dev])
- : NULL;
-}
-
-int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned);
-
-void bch2_dev_missing_atomic(struct bch_fs *, unsigned);
-
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
-{
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
- if (unlikely(!ca))
- bch2_dev_missing_atomic(c, dev);
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
-{
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
- if (ca)
- bch2_dev_get(ca);
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
-{
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
- if (unlikely(!ca))
- bch2_dev_missing_atomic(c, dev);
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
-{
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
- if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
- bch2_dev_put(ca);
- ca = NULL;
- }
- return ca;
-}
-
-void bch2_dev_bucket_missing(struct bch_dev *, u64);
-
-static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
-{
- struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
- if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
- bch2_dev_bucket_missing(ca, bucket.offset);
- bch2_dev_put(ca);
- ca = NULL;
- }
- return ca;
-}
-
-static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
-{
- if (ca && ca->dev_idx == dev_idx)
- return ca;
- bch2_dev_put(ca);
- return bch2_dev_tryget_noerror(c, dev_idx);
-}
-
-static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx)
-{
- if (ca && ca->dev_idx == dev_idx)
- return ca;
- bch2_dev_put(ca);
- return bch2_dev_tryget(c, dev_idx);
-}
-
-static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
- int rw, unsigned ref_idx)
-{
- might_sleep();
-
- guard(rcu)();
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
- if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))
- return NULL;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
- return ca;
-
- enumerated_ref_put(&ca->io_ref[rw], ref_idx);
- return NULL;
-}
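A usage sketch for bch2_dev_get_ioref(); read_one_block() is hypothetical, and ref_idx would normally be one of the caller's enumerated io_ref holders:

/* Sketch only. */
static int example_read_from_dev(struct bch_fs *c, unsigned dev, unsigned ref_idx)
{
        struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ, ref_idx);
        if (!ca)
                return -EIO;    /* device missing, or not readable right now */

        int ret = read_one_block(ca);

        enumerated_ref_put(&ca->io_ref[READ], ref_idx);
        return ret;
}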
-
-extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
-extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
-
-static inline bool bch2_member_alive(struct bch_member *m)
-{
- return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) &&
- !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID);
-}
-
-static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
-{
- if (dev < sb->nr_devices) {
- struct bch_member m = bch2_sb_member_get(sb, dev);
- return bch2_member_alive(&m);
- }
- return false;
-}
-
-unsigned bch2_sb_nr_devices(const struct bch_sb *);
-
-static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
-{
- return (struct bch_member_cpu) {
- .nbuckets = le64_to_cpu(mi->nbuckets),
- .nbuckets_minus_first = le64_to_cpu(mi->nbuckets) -
- le16_to_cpu(mi->first_bucket),
- .first_bucket = le16_to_cpu(mi->first_bucket),
- .bucket_size = le16_to_cpu(mi->bucket_size),
- .group = BCH_MEMBER_GROUP(mi),
- .state = BCH_MEMBER_STATE(mi),
- .discard = BCH_MEMBER_DISCARD(mi),
- .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
- .durability = BCH_MEMBER_DURABILITY(mi)
- ? BCH_MEMBER_DURABILITY(mi) - 1
- : 1,
- .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
- .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi),
- .valid = bch2_member_alive(mi),
- .btree_bitmap_shift = mi->btree_bitmap_shift,
- .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
- };
-}
-
-void bch2_sb_members_from_cpu(struct bch_fs *);
-
-void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
-void bch2_dev_errors_reset(struct bch_dev *);
-
-static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
-{
- u64 end = start + sectors;
-
- if (end > 64ULL << ca->mi.btree_bitmap_shift)
- return false;
-
- for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
- (u64) bit << ca->mi.btree_bitmap_shift < end;
- bit++)
- if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
- return false;
- return true;
-}
-
-bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
-void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
-
-int bch2_sb_member_alloc(struct bch_fs *);
-void bch2_sb_members_clean_deleted(struct bch_fs *);
-
-#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
deleted file mode 100644
index fb72ad730518..000000000000
--- a/fs/bcachefs/sb-members_format.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H
-#define _BCACHEFS_SB_MEMBERS_FORMAT_H
-
-/*
- * We refer to members with bitmasks in various places - but we need to get rid
- * of this limit:
- */
-#define BCH_SB_MEMBERS_MAX 64
-
-/*
- * Sentinel value - indicates a device that does not exist
- */
-#define BCH_SB_MEMBER_INVALID 255
-
-#define BCH_SB_MEMBER_DELETED_UUID \
- UUID_INIT(0xffffffff, 0xffff, 0xffff, \
- 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
-
-#define BCH_MIN_NR_NBUCKETS (1 << 6)
-
-#define BCH_IOPS_MEASUREMENTS() \
- x(seqread, 0) \
- x(seqwrite, 1) \
- x(randread, 2) \
- x(randwrite, 3)
-
-enum bch_iops_measurement {
-#define x(t, n) BCH_IOPS_##t = n,
- BCH_IOPS_MEASUREMENTS()
-#undef x
- BCH_IOPS_NR
-};
-
-#define BCH_MEMBER_ERROR_TYPES() \
- x(read, 0) \
- x(write, 1) \
- x(checksum, 2)
-
-enum bch_member_error_type {
-#define x(t, n) BCH_MEMBER_ERROR_##t = n,
- BCH_MEMBER_ERROR_TYPES()
-#undef x
- BCH_MEMBER_ERROR_NR
-};
-
-struct bch_member {
- __uuid_t uuid;
- __le64 nbuckets; /* device size */
- __le16 first_bucket; /* index of first bucket used */
- __le16 bucket_size; /* sectors */
- __u8 btree_bitmap_shift;
- __u8 pad[3];
- __le64 last_mount; /* time_t */
-
- __le64 flags;
- __le32 iops[4];
- __le64 errors[BCH_MEMBER_ERROR_NR];
- __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
- __le64 errors_reset_time;
- __le64 seq;
- __le64 btree_allocated_bitmap;
- /*
- * On recovery from a clean shutdown we don't normally read the journal,
- * but we still want to resume writing from where we left off so we
- * don't overwrite more than is necessary, for list journal debugging:
- */
- __le32 last_journal_bucket;
- __le32 last_journal_bucket_offset;
-};
-
-/*
- * btree_allocated_bitmap has 64 bits, each covering (1 << btree_bitmap_shift)
- * sectors; covering the full u64 sector address space therefore requires a
- * shift of at most 64 - ilog2(64) = 58:
- */
-#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58
-
-/*
- * This limit comes from the bucket_gens array - it's a single allocation, and
- * kernel allocations are limited to INT_MAX
- */
-#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
-
-#define BCH_MEMBER_V1_BYTES 56
-
-LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16)
-LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
-/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
-LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
-LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
- struct bch_member, flags, 30, 31)
-LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT,
- struct bch_member, flags, 31, 32)
-
-#if 0
-LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
-LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
-#endif
-
-#define BCH_MEMBER_STATES() \
- x(rw, 0) \
- x(ro, 1) \
- x(failed, 2) \
- x(spare, 3)
-
-enum bch_member_state {
-#define x(t, n) BCH_MEMBER_STATE_##t = n,
- BCH_MEMBER_STATES()
-#undef x
- BCH_MEMBER_STATE_NR
-};
-
-struct bch_sb_field_members_v1 {
- struct bch_sb_field field;
- struct bch_member _members[]; //Members are now variable size
-};
-
-struct bch_sb_field_members_v2 {
- struct bch_sb_field field;
- __le16 member_bytes; //size of single member entry
- u8 pad[6];
- struct bch_member _members[];
-};
-
-#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h
deleted file mode 100644
index d6443e186872..000000000000
--- a/fs/bcachefs/sb-members_types.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SB_MEMBERS_TYPES_H
-#define _BCACHEFS_SB_MEMBERS_TYPES_H
-
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u64 nbuckets_minus_first;
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u16 group;
- u8 state;
- u8 discard;
- u8 data_allowed;
- u8 durability;
- u8 freespace_initialized;
- u8 resize_on_mount;
- u8 valid;
- u8 btree_bitmap_shift;
- u64 btree_allocated_bitmap;
-};
-
-#endif /* _BCACHEFS_SB_MEMBERS_TYPES_H */
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
deleted file mode 100644
index c4b3d8d3f414..000000000000
--- a/fs/bcachefs/seqmutex.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SEQMUTEX_H
-#define _BCACHEFS_SEQMUTEX_H
-
-#include <linux/mutex.h>
-
-struct seqmutex {
- struct mutex lock;
- u32 seq;
-};
-
-#define seqmutex_init(_lock) mutex_init(&(_lock)->lock)
-
-static inline bool seqmutex_trylock(struct seqmutex *lock)
-{
- return mutex_trylock(&lock->lock);
-}
-
-static inline void seqmutex_lock(struct seqmutex *lock)
-{
- mutex_lock(&lock->lock);
- lock->seq++;
-}
-
-static inline u32 seqmutex_unlock(struct seqmutex *lock)
-{
- u32 seq = lock->seq;
- mutex_unlock(&lock->lock);
- return seq;
-}
-
-static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
-{
- if (lock->seq != seq || !mutex_trylock(&lock->lock))
- return false;
-
- if (lock->seq != seq) {
- mutex_unlock(&lock->lock);
- return false;
- }
-
- return true;
-}
-
-#endif /* _BCACHEFS_SEQMUTEX_H */
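A usage sketch of the seqmutex pattern: drop the lock across a blocking operation and detect whether anyone else took it in the meantime. struct foo and do_blocking_work() are hypothetical:

struct foo {
        struct seqmutex lock;
        int             state;
};

static int foo_read_then_block(struct foo *f)
{
        seqmutex_lock(&f->lock);
        int seen = f->state;
        u32 seq = seqmutex_unlock(&f->lock);

        do_blocking_work();

        if (!seqmutex_relock(&f->lock, seq))
                return -EAGAIN;         /* lock was retaken; state may have changed */

        /* nobody held the lock in between, so @seen is still current */
        seqmutex_unlock(&f->lock);
        return seen;
}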
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
deleted file mode 100644
index a1cc44e66c7e..000000000000
--- a/fs/bcachefs/siphash.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: BSD-3-Clause
-/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
-
-/*-
- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
- * are the number of compression rounds and the number of finalization rounds.
- * A compression round is identical to a finalization round and this round
- * function is called SipRound. Given a 128-bit key k and a (possibly empty)
- * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
- *
- * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
- * by Jean-Philippe Aumasson and Daniel J. Bernstein,
- * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
- * https://131002.net/siphash/siphash.pdf
- * https://131002.net/siphash/
- */
-
-#include <asm/byteorder.h>
-#include <linux/unaligned.h>
-#include <linux/bitops.h>
-#include <linux/string.h>
-
-#include "siphash.h"
-
-static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
-{
- while (rounds--) {
- ctx->v[0] += ctx->v[1];
- ctx->v[2] += ctx->v[3];
- ctx->v[1] = rol64(ctx->v[1], 13);
- ctx->v[3] = rol64(ctx->v[3], 16);
-
- ctx->v[1] ^= ctx->v[0];
- ctx->v[3] ^= ctx->v[2];
- ctx->v[0] = rol64(ctx->v[0], 32);
-
- ctx->v[2] += ctx->v[1];
- ctx->v[0] += ctx->v[3];
- ctx->v[1] = rol64(ctx->v[1], 17);
- ctx->v[3] = rol64(ctx->v[3], 21);
-
- ctx->v[1] ^= ctx->v[2];
- ctx->v[3] ^= ctx->v[0];
- ctx->v[2] = rol64(ctx->v[2], 32);
- }
-}
-
-static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
-{
- u64 m = get_unaligned_le64(ptr);
-
- ctx->v[3] ^= m;
- SipHash_Rounds(ctx, rounds);
- ctx->v[0] ^= m;
-}
-
-void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
-{
- u64 k0, k1;
-
- k0 = le64_to_cpu(key->k0);
- k1 = le64_to_cpu(key->k1);
-
- ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
- ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
- ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
- ctx->v[3] = 0x7465646279746573ULL ^ k1;
-
- memset(ctx->buf, 0, sizeof(ctx->buf));
- ctx->bytes = 0;
-}
-
-void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
- const void *src, size_t len)
-{
- const u8 *ptr = src;
- size_t left, used;
-
- if (len == 0)
- return;
-
- used = ctx->bytes % sizeof(ctx->buf);
- ctx->bytes += len;
-
- if (used > 0) {
- left = sizeof(ctx->buf) - used;
-
- if (len >= left) {
- memcpy(&ctx->buf[used], ptr, left);
- SipHash_CRounds(ctx, ctx->buf, rc);
- len -= left;
- ptr += left;
- used = 0;
- } else {
- memcpy(&ctx->buf[used], ptr, len);
- return;
- }
- }
-
- while (len >= sizeof(ctx->buf)) {
- SipHash_CRounds(ctx, ptr, rc);
- len -= sizeof(ctx->buf);
- ptr += sizeof(ctx->buf);
- }
-
- if (len > 0)
- memcpy(&ctx->buf[used], ptr, len);
-}
-
-void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
-{
- u64 r;
-
- r = SipHash_End(ctx, rc, rf);
-
- *((__le64 *) dst) = cpu_to_le64(r);
-}
-
-u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
-{
- u64 r;
- size_t left, used;
-
- used = ctx->bytes % sizeof(ctx->buf);
- left = sizeof(ctx->buf) - used;
- memset(&ctx->buf[used], 0, left - 1);
- ctx->buf[7] = ctx->bytes;
-
- SipHash_CRounds(ctx, ctx->buf, rc);
- ctx->v[2] ^= 0xff;
- SipHash_Rounds(ctx, rf);
-
- r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
- memset(ctx, 0, sizeof(*ctx));
- return r;
-}
-
-u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
-{
- SIPHASH_CTX ctx;
-
- SipHash_Init(&ctx, key);
- SipHash_Update(&ctx, rc, rf, src, len);
- return SipHash_End(&ctx, rc, rf);
-}
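A usage sketch, assuming SipHash-2-4 round counts: the streaming interface and the one-shot helper produce the same digest for the same key and input:

/* Sketch only. */
static u64 siphash24_of_buf(const SIPHASH_KEY *key, const void *buf, size_t len)
{
        SIPHASH_CTX ctx;

        SipHash_Init(&ctx, key);
        SipHash_Update(&ctx, 2, 4, buf, len);
        u64 streamed = SipHash_End(&ctx, 2, 4);

        u64 oneshot = SipHash(key, 2, 4, buf, len);

        WARN_ON(streamed != oneshot);   /* both paths must agree */
        return oneshot;
}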
diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h
deleted file mode 100644
index 3dfaf34a43b2..000000000000
--- a/fs/bcachefs/siphash.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause */
-/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
-/*-
- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-/*
- * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
- * optimized for speed on short messages returning a 64bit hash/digest value.
- *
- * The number of rounds is selected by the macro family used:
- *  SipHash24_*() for the fast and reasonably strong version
- *  SipHash48_*() for the stronger version (half as fast)
- *
- * SIPHASH_CTX ctx;
- * SIPHASH_KEY key = { ... };	(a 16 byte key)
- * SipHash24_Init(&ctx, &key);
- * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
- * SipHash24_Final(output, &ctx);
- */
-
-#ifndef _SIPHASH_H_
-#define _SIPHASH_H_
-
-#include <linux/types.h>
-
-#define SIPHASH_BLOCK_LENGTH 8
-#define SIPHASH_KEY_LENGTH 16
-#define SIPHASH_DIGEST_LENGTH 8
-
-typedef struct _SIPHASH_CTX {
- u64 v[4];
- u8 buf[SIPHASH_BLOCK_LENGTH];
- u32 bytes;
-} SIPHASH_CTX;
-
-typedef struct {
- __le64 k0;
- __le64 k1;
-} SIPHASH_KEY;
-
-void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
-void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
-u64 SipHash_End(SIPHASH_CTX *, int, int);
-void SipHash_Final(void *, SIPHASH_CTX *, int, int);
-u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
-
-#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k))
-#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l))
-#define SipHash24_End(_d) SipHash_End((_d), 2, 4)
-#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4)
-#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l))
-
-#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k))
-#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l))
-#define SipHash48_End(_d) SipHash_End((_d), 4, 8)
-#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8)
-#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l))
-
-#endif /* _SIPHASH_H_ */
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
deleted file mode 100644
index 538c324f4765..000000000000
--- a/fs/bcachefs/six.c
+++ /dev/null
@@ -1,878 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/export.h>
-#include <linux/log2.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/task.h>
-#include <linux/slab.h>
-
-#include <trace/events/lock.h>
-
-#include "six.h"
-
-#ifdef DEBUG
-#define EBUG_ON(cond) BUG_ON(cond)
-#else
-#define EBUG_ON(cond) do {} while (0)
-#endif
-
-#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip)
-#define six_release(l, ip) lock_release(l, ip)
-
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
-
-#define SIX_LOCK_HELD_read_OFFSET 0
-#define SIX_LOCK_HELD_read ~(~0U << 26)
-#define SIX_LOCK_HELD_intent (1U << 26)
-#define SIX_LOCK_HELD_write (1U << 27)
-#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read))
-#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write))
-#define SIX_LOCK_NOSPIN (1U << 31)
-
-struct six_lock_vals {
- /* Value we add to the lock in order to take the lock: */
- u32 lock_val;
-
- /* If the lock has this value (used as a mask), taking the lock fails: */
- u32 lock_fail;
-
- /* Mask that indicates lock is held for this type: */
- u32 held_mask;
-
- /* Waitlist we wakeup when releasing the lock: */
- enum six_lock_type unlock_wakeup;
-};
-
-static const struct six_lock_vals l[] = {
- [SIX_LOCK_read] = {
- .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET,
- .lock_fail = SIX_LOCK_HELD_write,
- .held_mask = SIX_LOCK_HELD_read,
- .unlock_wakeup = SIX_LOCK_write,
- },
- [SIX_LOCK_intent] = {
- .lock_val = SIX_LOCK_HELD_intent,
- .lock_fail = SIX_LOCK_HELD_intent,
- .held_mask = SIX_LOCK_HELD_intent,
- .unlock_wakeup = SIX_LOCK_intent,
- },
- [SIX_LOCK_write] = {
- .lock_val = SIX_LOCK_HELD_write,
- .lock_fail = SIX_LOCK_HELD_read,
- .held_mask = SIX_LOCK_HELD_write,
- .unlock_wakeup = SIX_LOCK_read,
- },
-};
-
-static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
-{
- if ((atomic_read(&lock->state) & mask) != mask)
- atomic_or(mask, &lock->state);
-}
-
-static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
-{
- if (atomic_read(&lock->state) & mask)
- atomic_and(~mask, &lock->state);
-}
-
-static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
- u32 old, struct task_struct *owner)
-{
- if (type != SIX_LOCK_intent)
- return;
-
- if (!(old & SIX_LOCK_HELD_intent)) {
- EBUG_ON(lock->owner);
- lock->owner = owner;
- } else {
- EBUG_ON(lock->owner != current);
- }
-}
-
-static inline unsigned pcpu_read_count(struct six_lock *lock)
-{
- unsigned read_count = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- read_count += *per_cpu_ptr(lock->readers, cpu);
- return read_count;
-}
-
-/*
- * __do_six_trylock() - main trylock routine
- *
- * Returns 1 on success, 0 on failure
- *
- * In percpu reader mode, a failed trylock may cause a spurious trylock failure
- * for another thread taking the competing lock type, and we may have to do a
- * wakeup: when a wakeup is required, we return -1 - wakeup_type.
- */
-static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
- struct task_struct *task, bool try)
-{
- int ret;
- u32 old;
-
- EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
- EBUG_ON(type == SIX_LOCK_write &&
- (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
-
- /*
- * Percpu reader mode:
- *
- * The basic idea behind this algorithm is that you can implement a lock
- * between two threads without any atomics, just memory barriers:
- *
- * For two threads you'll need two variables, one variable for "thread a
- * has the lock" and another for "thread b has the lock".
- *
- * To take the lock, a thread sets its variable indicating that it holds
- * the lock, then issues a full memory barrier, then reads from the
- * other thread's variable to check if the other thread thinks it has
- * the lock. If we raced, we backoff and retry/sleep.
- *
- * Failure to take the lock may cause a spurious trylock failure in
- * another thread, because we temporarily set the lock to indicate that
- * we held it. This would be a problem for a thread in six_lock() that is
- * calling trylock after adding itself to the waitlist and prior to
- * sleeping.
- *
- * Therefore, if we fail to get the lock, and there were waiters of the
- * type we conflict with, we will have to issue a wakeup.
- *
- * Since we may be called under wait_lock (and by the wakeup code
- * itself), we return that the wakeup has to be done instead of doing it
- * here.
- */
- if (type == SIX_LOCK_read && lock->readers) {
- preempt_disable();
- this_cpu_inc(*lock->readers); /* signal that we own lock */
-
- smp_mb();
-
- old = atomic_read(&lock->state);
- ret = !(old & l[type].lock_fail);
-
- this_cpu_sub(*lock->readers, !ret);
- preempt_enable();
-
- if (!ret) {
- smp_mb();
- if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
- ret = -1 - SIX_LOCK_write;
- }
- } else if (type == SIX_LOCK_write && lock->readers) {
- if (try)
- atomic_add(SIX_LOCK_HELD_write, &lock->state);
-
- /*
- * Make sure the atomic_add above happens before pcpu_read_count(), and
- * that six_set_bitmask() in the slow path happens before pcpu_read_count().
- *
- * Paired with the smp_mb() in read lock fast path (per-cpu mode)
- * and the one before atomic_read in read unlock path.
- */
- smp_mb();
- ret = !pcpu_read_count(lock);
-
- if (try && !ret) {
- old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
- if (old & SIX_LOCK_WAITING_read)
- ret = -1 - SIX_LOCK_read;
- }
- } else {
- old = atomic_read(&lock->state);
- do {
- ret = !(old & l[type].lock_fail);
- if (!ret || (type == SIX_LOCK_write && !try)) {
- smp_mb();
- break;
- }
- } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
-
- EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
- }
-
- if (ret > 0)
- six_set_owner(lock, type, old, task);
-
- EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
- (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
-
- return ret;
-}
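The percpu reader comment above describes a two-flag, full-barrier handshake. A self-contained userspace C11 sketch of just that idea, illustrative only and not the kernel implementation; a failed trylock backs off so the other side can make progress:

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic bool held_a, held_b;

static bool trylock_a(void)
{
        atomic_store_explicit(&held_a, true, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* full barrier */

        if (atomic_load_explicit(&held_b, memory_order_relaxed)) {
                /* raced with the other side: back off, caller retries or sleeps */
                atomic_store_explicit(&held_a, false, memory_order_relaxed);
                return false;
        }
        return true;            /* we own the lock */
}

static void unlock_a(void)
{
        atomic_store_explicit(&held_a, false, memory_order_release);
}

int main(void)
{
        if (trylock_a()) {
                /* critical section for side a */
                unlock_a();
        }
        return 0;
}

trylock_b()/unlock_b() would be the mirror image with held_a and held_b swapped.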
-
-static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
-{
- struct six_lock_waiter *w, *next;
- struct task_struct *task;
- bool saw_one;
- int ret;
-again:
- ret = 0;
- saw_one = false;
- raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry_safe(w, next, &lock->wait_list, list) {
- if (w->lock_want != lock_type)
- continue;
-
- if (saw_one && lock_type != SIX_LOCK_read)
- goto unlock;
- saw_one = true;
-
- ret = __do_six_trylock(lock, lock_type, w->task, false);
- if (ret <= 0)
- goto unlock;
-
- /*
- * Similar to percpu_rwsem_wake_function(), we need to guard
- * against the wakee noticing w->lock_acquired, returning, and
- * then exiting before we do the wakeup:
- */
- task = get_task_struct(w->task);
- __list_del(w->list.prev, w->list.next);
- /*
- * The release barrier here ensures the ordering of the
- * __list_del before setting w->lock_acquired; @w is on the
- * stack of the thread doing the waiting and will be reused
- * after it sees w->lock_acquired with no other locking:
- * pairs with smp_load_acquire() in six_lock_slowpath()
- */
- smp_store_release(&w->lock_acquired, true);
- wake_up_process(task);
- put_task_struct(task);
- }
-
- six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
-unlock:
- raw_spin_unlock(&lock->wait_lock);
-
- if (ret < 0) {
- lock_type = -ret - 1;
- goto again;
- }
-}
-
-__always_inline
-static void six_lock_wakeup(struct six_lock *lock, u32 state,
- enum six_lock_type lock_type)
-{
- if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
- return;
-
- if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
- return;
-
- __six_lock_wakeup(lock, lock_type);
-}
-
-__always_inline
-static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
-{
- int ret;
-
- ret = __do_six_trylock(lock, type, current, try);
- if (ret < 0)
- __six_lock_wakeup(lock, -ret - 1);
-
- return ret > 0;
-}
-
-/**
- * six_trylock_ip - attempt to take a six lock without blocking
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
- if (!do_six_trylock(lock, type, true))
- return false;
-
- if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
- return true;
-}
-EXPORT_SYMBOL_GPL(six_trylock_ip);
-
-/**
- * six_relock_ip - attempt to re-take a lock that was held previously
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq: lock sequence number obtained from six_lock_seq() while lock was
- * held previously
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
- unsigned seq, unsigned long ip)
-{
- if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
- return false;
-
- if (six_lock_seq(lock) != seq) {
- six_unlock_ip(lock, type, ip);
- return false;
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(six_relock_ip);
-
-#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
-
-static inline bool six_owner_running(struct six_lock *lock)
-{
- /*
- * When there's no owner, we might have preempted between the owner
- * acquiring the lock and setting the owner field. If we're an RT task
- * that will live-lock because we won't let the owner complete.
- */
- guard(rcu)();
- struct task_struct *owner = READ_ONCE(lock->owner);
- return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock,
- struct six_lock_waiter *wait,
- enum six_lock_type type)
-{
- unsigned loop = 0;
- u64 end_time;
-
- if (type == SIX_LOCK_write)
- return false;
-
- if (lock->wait_list.next != &wait->list)
- return false;
-
- if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN)
- return false;
-
- preempt_disable();
- end_time = sched_clock() + 10 * NSEC_PER_USEC;
-
- while (!need_resched() && six_owner_running(lock)) {
- /*
- * Ensures that writes to the waitlist entry happen after we see
- * wait->lock_acquired: pairs with the smp_store_release in
- * __six_lock_wakeup
- */
- if (smp_load_acquire(&wait->lock_acquired)) {
- preempt_enable();
- return true;
- }
-
- if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
- six_set_bitmask(lock, SIX_LOCK_NOSPIN);
- break;
- }
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax();
- }
-
- preempt_enable();
- return false;
-}
-
-#else /* !CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN */
-
-static inline bool six_optimistic_spin(struct six_lock *lock,
- struct six_lock_waiter *wait,
- enum six_lock_type type)
-{
- return false;
-}
-
-#endif
-
-noinline
-static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- int ret = 0;
-
- if (type == SIX_LOCK_write) {
- EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
- atomic_add(SIX_LOCK_HELD_write, &lock->state);
- smp_mb__after_atomic();
- }
-
- trace_contention_begin(lock, 0);
- lock_contended(&lock->dep_map, ip);
-
- wait->task = current;
- wait->lock_want = type;
- wait->lock_acquired = false;
-
- raw_spin_lock(&lock->wait_lock);
- six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
- /*
- * Retry taking the lock after taking waitlist lock, in case we raced
- * with an unlock:
- */
- ret = __do_six_trylock(lock, type, current, false);
- if (ret <= 0) {
- wait->start_time = local_clock();
-
- if (!list_empty(&lock->wait_list)) {
- struct six_lock_waiter *last =
- list_last_entry(&lock->wait_list,
- struct six_lock_waiter, list);
-
- if (time_before_eq64(wait->start_time, last->start_time))
- wait->start_time = last->start_time + 1;
- }
-
- list_add_tail(&wait->list, &lock->wait_list);
- }
- raw_spin_unlock(&lock->wait_lock);
-
- if (unlikely(ret > 0)) {
- ret = 0;
- goto out;
- }
-
- if (unlikely(ret < 0)) {
- __six_lock_wakeup(lock, -ret - 1);
- ret = 0;
- }
-
- if (six_optimistic_spin(lock, wait, type))
- goto out;
-
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- /*
- * Ensures that writes to the waitlist entry happen after we see
- * wait->lock_acquired: pairs with the smp_store_release in
- * __six_lock_wakeup
- */
- if (smp_load_acquire(&wait->lock_acquired))
- break;
-
- ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (unlikely(ret)) {
- bool acquired;
-
- /*
- * If should_sleep_fn() returns an error, we are
- * required to return that error even if we already
- * acquired the lock - should_sleep_fn() might have
- * modified external state (e.g. when the deadlock cycle
- * detector in bcachefs issued a transaction restart)
- */
- raw_spin_lock(&lock->wait_lock);
- acquired = wait->lock_acquired;
- if (!acquired)
- list_del(&wait->list);
- raw_spin_unlock(&lock->wait_lock);
-
- if (unlikely(acquired)) {
- do_six_unlock_type(lock, type);
- } else if (type == SIX_LOCK_write) {
- six_clear_bitmask(lock, SIX_LOCK_HELD_write);
- six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
- }
- break;
- }
-
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
-out:
- trace_contention_end(lock, 0);
-
- return ret;
-}
-
-/**
- * six_lock_ip_waiter - take a lock, with full waitlist interface
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait: pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * This is the most general six_lock() variant, with parameters to support full
- * cycle detection for deadlock avoidance.
- *
- * The code calling this function must implement tracking of held locks, and the
- * @wait object should be embedded into the struct that tracks held locks -
- * which must also be accessible in a thread-safe way.
- *
- * @should_sleep_fn should invoke the cycle detector; it should walk each
- * lock's waiters, and for each waiter recursively walk their held locks.
- *
- * When this function must block, @wait will be added to @lock's waitlist before
- * calling trylock, and before calling @should_sleep_fn, and @wait will not be
- * removed from the lock waitlist until the lock has been successfully acquired,
- * or we abort.
- *
- * @wait.start_time will be monotonically increasing for any given waitlist, and
- * thus may be used as a loop cursor.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- int ret;
-
- wait->start_time = 0;
-
- if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
-
- ret = do_six_trylock(lock, type, true) ? 0
- : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
-
- if (ret && type != SIX_LOCK_write)
- six_release(&lock->dep_map, ip);
- if (!ret)
- lock_acquired(&lock->dep_map, ip);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
-
-__always_inline
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- u32 state;
-
- if (type == SIX_LOCK_intent)
- lock->owner = NULL;
-
- if (type == SIX_LOCK_read &&
- lock->readers) {
- smp_mb(); /* unlock barrier */
- this_cpu_dec(*lock->readers);
- smp_mb(); /* between unlocking and checking for waiters */
- state = atomic_read(&lock->state);
- } else {
- u32 v = l[type].lock_val;
-
- if (type != SIX_LOCK_read)
- v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
-
- EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
- state = atomic_sub_return_release(v, &lock->state);
- }
-
- six_lock_wakeup(lock, state, l[type].unlock_wakeup);
-}
-
-/**
- * six_unlock_ip - drop a six lock
- * @lock: lock to unlock
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock); read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
- * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1
- * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0
- */
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
- EBUG_ON(type == SIX_LOCK_write &&
- !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
- EBUG_ON((type == SIX_LOCK_write ||
- type == SIX_LOCK_intent) &&
- lock->owner != current);
-
- if (type != SIX_LOCK_write)
- six_release(&lock->dep_map, ip);
-
- if (type == SIX_LOCK_intent &&
- lock->intent_lock_recurse) {
- --lock->intent_lock_recurse;
- return;
- }
-
- if (type == SIX_LOCK_write &&
- lock->write_lock_recurse) {
- --lock->write_lock_recurse;
- return;
- }
-
- if (type == SIX_LOCK_write)
- lock->seq++;
-
- do_six_unlock_type(lock, type);
-}
-EXPORT_SYMBOL_GPL(six_unlock_ip);
-
-/**
- * six_lock_downgrade - convert an intent lock to a read lock
- * @lock: lock to downgrade
- *
- * @lock will have read count incremented and intent count decremented
- */
-void six_lock_downgrade(struct six_lock *lock)
-{
- six_lock_increment(lock, SIX_LOCK_read);
- six_unlock_intent(lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_downgrade);
-
-/**
- * six_lock_tryupgrade - attempt to convert read lock to an intent lock
- * @lock: lock to upgrade
- *
- * On success, @lock will have intent count incremented and read count
- * decremented
- *
- * Return: true on success, false on failure
- */
-bool six_lock_tryupgrade(struct six_lock *lock)
-{
- u32 old = atomic_read(&lock->state), new;
-
- do {
- new = old;
-
- if (new & SIX_LOCK_HELD_intent)
- return false;
-
- if (!lock->readers) {
- EBUG_ON(!(new & SIX_LOCK_HELD_read));
- new -= l[SIX_LOCK_read].lock_val;
- }
-
- new |= SIX_LOCK_HELD_intent;
- } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
-
- if (lock->readers)
- this_cpu_dec(*lock->readers);
-
- six_set_owner(lock, SIX_LOCK_intent, old, current);
-
- return true;
-}
-EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
-
-/**
- * six_trylock_convert - attempt to convert a held lock from one type to another
- * @lock: lock to upgrade
- * @from: SIX_LOCK_read or SIX_LOCK_intent
- * @to: SIX_LOCK_read or SIX_LOCK_intent
- *
- * On success, @lock will have intent count incremented and read count
- * decremented
- *
- * Return: true on success, false on failure
- */
-bool six_trylock_convert(struct six_lock *lock,
- enum six_lock_type from,
- enum six_lock_type to)
-{
- EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
-
- if (to == from)
- return true;
-
- if (to == SIX_LOCK_read) {
- six_lock_downgrade(lock);
- return true;
- } else {
- return six_lock_tryupgrade(lock);
- }
-}
-EXPORT_SYMBOL_GPL(six_trylock_convert);
-
-/**
- * six_lock_increment - increase held lock count on a lock that is already held
- * @lock: lock to increment
- * @type: SIX_LOCK_read or SIX_LOCK_intent
- *
- * @lock must already be held, with a lock type that is greater than or equal to
- * @type
- *
- * A corresponding six_unlock_type() call will be required for @lock to be fully
- * unlocked.
- */
-void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
-{
- six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
-
- /* XXX: assert already locked, and that we don't overflow: */
-
- switch (type) {
- case SIX_LOCK_read:
- if (lock->readers) {
- this_cpu_inc(*lock->readers);
- } else {
- EBUG_ON(!(atomic_read(&lock->state) &
- (SIX_LOCK_HELD_read|
- SIX_LOCK_HELD_intent)));
- atomic_add(l[type].lock_val, &lock->state);
- }
- break;
- case SIX_LOCK_write:
- lock->write_lock_recurse++;
- fallthrough;
- case SIX_LOCK_intent:
- EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
- lock->intent_lock_recurse++;
- break;
- }
-}
-EXPORT_SYMBOL_GPL(six_lock_increment);
-
-/**
- * six_lock_wakeup_all - wake up all waiters on @lock
- * @lock: lock to wake up waiters for
- *
- * Waking up waiters will cause them to re-run should_sleep_fn, which may then
- * abort the lock operation.
- *
- * This function is never needed in a bug-free program; it's only useful in
- * debug code, e.g. to determine if a cycle detector is at fault.
- */
-void six_lock_wakeup_all(struct six_lock *lock)
-{
- u32 state = atomic_read(&lock->state);
- struct six_lock_waiter *w;
-
- six_lock_wakeup(lock, state, SIX_LOCK_read);
- six_lock_wakeup(lock, state, SIX_LOCK_intent);
- six_lock_wakeup(lock, state, SIX_LOCK_write);
-
- raw_spin_lock(&lock->wait_lock);
- list_for_each_entry(w, &lock->wait_list, list)
- wake_up_process(w->task);
- raw_spin_unlock(&lock->wait_lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-
-/**
- * six_lock_counts - return held lock counts, for each lock type
- * @lock: lock to return counters for
- *
- * Return: the number of times a lock is held for read, intent and write.
- */
-struct six_lock_count six_lock_counts(struct six_lock *lock)
-{
- struct six_lock_count ret;
-
- ret.n[SIX_LOCK_read] = !lock->readers
- ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
- : pcpu_read_count(lock);
- ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
- lock->intent_lock_recurse;
- ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_counts);
-
-/**
- * six_lock_readers_add - directly manipulate reader count of a lock
- * @lock: lock to add/subtract readers for
- * @nr: reader count to add/subtract
- *
- * When an upper layer is implementing lock reentrancy, we may have both read
- * and intent locks on the same lock.
- *
- * When we need to take a write lock, the read locks will cause self-deadlock,
- * because six locks themselves do not track which read locks are held by the
- * current thread and which are held by a different thread - they do no
- * per-thread tracking of held locks.
- *
- * The upper layer that is tracking held locks may however, if trylock() has
- * failed, count up its own read locks, subtract them, take the write lock, and
- * then re-add them.
- *
- * As in any other situation when taking a write lock, @lock must be held for
- * intent one (or more) times, so @lock will never be left unlocked.
- */
-void six_lock_readers_add(struct six_lock *lock, int nr)
-{
- if (lock->readers) {
- this_cpu_add(*lock->readers, nr);
- } else {
- EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
- /* reader count starts at bit 0 */
- atomic_add(nr, &lock->state);
- }
-}
-EXPORT_SYMBOL_GPL(six_lock_readers_add);
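A sketch of the pattern this comment describes, using only interfaces declared in this file; my_read_locks_held() is a hypothetical upper-layer counter of read locks the current thread already holds, and the thread is assumed to also hold the intent lock:

static void write_lock_with_own_readers(struct six_lock *lock)
{
        struct six_lock_waiter wait;
        int readers = my_read_locks_held(lock);         /* hypothetical */

        six_lock_readers_add(lock, -readers);           /* hide our own read locks */
        six_lock_ip_waiter(lock, SIX_LOCK_write, &wait, NULL, NULL, _THIS_IP_);

        /* ... modify state protected by @lock ... */

        six_unlock_ip(lock, SIX_LOCK_write, _THIS_IP_);
        six_lock_readers_add(lock, readers);            /* restore the read counts */
}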
-
-/**
- * six_lock_exit - release resources held by a lock prior to freeing
- * @lock: lock to exit
- *
- * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
- * required to free the percpu read counts.
- */
-void six_lock_exit(struct six_lock *lock)
-{
- WARN_ON(lock->readers && pcpu_read_count(lock));
- WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
-
- free_percpu(lock->readers);
- lock->readers = NULL;
-}
-EXPORT_SYMBOL_GPL(six_lock_exit);
-
-void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags,
- gfp_t gfp)
-{
- atomic_set(&lock->state, 0);
- raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- debug_check_no_locks_freed((void *) lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-
- /*
- * Don't assume that we have real percpu variables available in
- * userspace:
- */
-#ifdef __KERNEL__
- if (flags & SIX_LOCK_INIT_PCPU) {
- /*
- * We don't return an error here on memory allocation failure
- * since percpu is an optimization, and locks will work with the
- * same semantics in non-percpu mode: callers can check for
- * failure if they wish by checking lock->readers, but generally
- * will not want to treat it as an error.
- */
- lock->readers = alloc_percpu_gfp(unsigned, gfp);
- }
-#endif
-}
-EXPORT_SYMBOL_GPL(__six_lock_init);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
deleted file mode 100644
index 59b851cf8bac..000000000000
--- a/fs/bcachefs/six.h
+++ /dev/null
@@ -1,388 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _LINUX_SIX_H
-#define _LINUX_SIX_H
-
-/**
- * DOC: SIX locks overview
- *
- * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
- * but with an additional state: read/shared, intent, exclusive/write
- *
- * The purpose of the intent state is to allow for greater concurrency on tree
- * structures without deadlocking. In general, a read can't be upgraded to a
- * write lock without deadlocking, so an operation that updates multiple nodes
- * will have to take write locks for the full duration of the operation.
- *
- * But by adding an intent state, which is exclusive with other intent locks but
- * not with readers, we can take intent locks at the start of the operation,
- * and then take write locks only for the actual update to each individual
- * nodes, without deadlocking.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
- *
- * An intent lock must be held before taking a write lock:
- * six_lock_intent(&foo->lock);
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
- * six_unlock_intent(&foo->lock);
- *
- * Other operations:
- * six_trylock_read()
- * six_trylock_intent()
- * six_trylock_write()
- *
- * six_lock_downgrade() convert from intent to read
- * six_lock_tryupgrade() attempt to convert from read to intent, may fail
- *
- * There are also interfaces that take the lock type as an enum:
- *
- * six_lock_type(&foo->lock, SIX_LOCK_read);
- * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
- * six_lock_type(&foo->lock, SIX_LOCK_write);
- * six_unlock_type(&foo->lock, SIX_LOCK_write);
- * six_unlock_type(&foo->lock, SIX_LOCK_intent);
- *
- * Lock sequence numbers - unlock(), relock():
- *
- * Locks embed sequence numbers, which are incremented on write lock/unlock.
- * This allows locks to be dropped and then retaken iff the state they protect
- * hasn't changed; this makes it much easier to avoid holding locks while e.g.
- * doing IO or allocating memory.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * u32 seq = six_lock_seq(&foo->lock);
- * six_unlock_read(&foo->lock);
- *
- * some_operation_that_may_block();
- *
- * if (six_relock_read(&foo->lock, seq)) { ... }
- *
- * If the relock operation succeeds, it is as if the lock was never unlocked.
- *
- * Reentrancy:
- *
- * Six locks are not by themselves reentrant, but have counters for both the
- * read and intent states that can be used to provide reentrancy by an upper
- * layer that tracks held locks. If a lock is known to already be held in the
- * read or intent state, six_lock_increment() can be used to bump the "lock
- * held in this state" counter, increasing the number of unlock calls that
- * will be required to fully unlock it.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * six_lock_increment(&foo->lock, SIX_LOCK_read);
- * six_unlock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
- * foo->lock is now fully unlocked.
- *
- * Since the intent state supersedes read, it's legal to increment the read
- * counter when holding an intent lock, but not the reverse.
- *
- * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
- * is not legal.
- *
- * should_sleep_fn:
- *
- * There is a six_lock() variant that takes a function pointer that is called
- * immediately prior to schedule() when blocking, and may return an error to
- * abort.
- *
- * One possible use for this feature is when objects being locked are part of
- * a cache and may be reused, and lock ordering is based on a property of the
- * object that will change when the object is reused - i.e. logical key order.
- *
- * If looking up an object in the cache may race with object reuse, and lock
- * ordering is required to prevent deadlock, object reuse may change the
- * correct lock order for that object and cause a deadlock. should_sleep_fn
- * can be used to check if the object is still the object we want and avoid
- * this deadlock.
- *
- * Wait list entry interface:
- *
- * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
- * wait list entry. By embedding six_lock_waiter into another object, and by
- * traversing lock waitlists, it is then possible for an upper layer to
- * implement full cycle detection for deadlock avoidance.
- *
- * should_sleep_fn should be used for invoking the cycle detector, walking the
- * graph of held locks to check for a deadlock. The upper layer must track
- * held locks for each thread, and each thread's held locks must be reachable
- * from its six_lock_waiter object.
- *
- * six_lock_waiter() will add the wait object to the waitlist before retrying
- * the trylock and before calling should_sleep_fn, and the wait object will not
- * be removed from the waitlist until either the lock has been successfully
- * acquired, or we aborted because should_sleep_fn returned an error.
- *
- * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
- * have timestamps in strictly ascending order - this is so the timestamp can
- * be used as a cursor for lock graph traversal.
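- *
- * A rough usage sketch ('struct my_trans', its held-lock tracking and
- * deadlock_check_fn() are hypothetical):
- *
- *   struct my_trans {
- *           struct six_lock_waiter  wait;
- *           ...                     // plus a per-thread list of held locks
- *   };
- *
- *   int ret = six_lock_ip_waiter(&foo->lock, SIX_LOCK_intent, &trans->wait,
- *                                deadlock_check_fn, trans, _THIS_IP_);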
- */
-
-#include <linux/lockdep.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-enum six_lock_type {
- SIX_LOCK_read,
- SIX_LOCK_intent,
- SIX_LOCK_write,
-};
-
-struct six_lock {
- atomic_t state;
- u32 seq;
- unsigned intent_lock_recurse;
- unsigned write_lock_recurse;
- struct task_struct *owner;
- unsigned __percpu *readers;
- raw_spinlock_t wait_lock;
- struct list_head wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-};
-
-struct six_lock_waiter {
- struct list_head list;
- struct task_struct *task;
- enum six_lock_type lock_want;
- bool lock_acquired;
- u64 start_time;
-};
-
-typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
-
-void six_lock_exit(struct six_lock *lock);
-
-enum six_lock_init_flags {
- SIX_LOCK_INIT_PCPU = 1U << 0,
-};
-
-void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags,
- gfp_t gfp);
-
-/**
- * six_lock_init - initialize a six lock
- * @lock: lock to initialize
- * @flags: optional flags, e.g. SIX_LOCK_INIT_PCPU
- * @gfp: allocation flags, passed through to __six_lock_init()
- */
-#define six_lock_init(lock, flags, gfp) \
-do { \
- static struct lock_class_key __key; \
- \
- __six_lock_init((lock), #lock, &__key, flags, gfp); \
-} while (0)
-
-/**
- * six_lock_seq - obtain current lock sequence number
- * @lock: six_lock to obtain sequence number for
- *
- * @lock should be held for read or intent, and not write
- *
- * By saving the lock sequence number, we can unlock @lock and then (typically
- * after some blocking operation) attempt to relock it: the relock will succeed
- * if the sequence number hasn't changed, meaning no write locks have been taken
- * and state corresponding to what @lock protects is still valid.
- */
-static inline u32 six_lock_seq(const struct six_lock *lock)
-{
- return lock->seq;
-}
-
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_trylock_type - attempt to take a six lock without blocking
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
- return six_trylock_ip(lock, type, _THIS_IP_);
-}
-
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip);
-
-/**
- * six_lock_waiter - take a lock, with full waitlist interface
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait: pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * This is a convenience wrapper around six_lock_ip_waiter(), see that function
- * for full documentation.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-/**
- * six_lock_ip - take a six lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- * @ip: ip parameter for lockdep/lockstat, e.g. _THIS_IP_
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- struct six_lock_waiter wait;
-
- return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-/**
- * six_lock_type - take a six lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- struct six_lock_waiter wait;
-
- return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
- unsigned seq, unsigned long ip);
-
-/**
- * six_relock_type - attempt to re-take a lock that was held previously
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq: lock sequence number obtained from six_lock_seq() while lock was
- * held previously
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
-{
- return six_relock_ip(lock, type, seq, _THIS_IP_);
-}
-
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_unlock_type - drop a six lock
- * @lock: lock to unlock
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock); read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
- * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 1
- * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 0
- */
-static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- six_unlock_ip(lock, type, _THIS_IP_);
-}
-
-#define __SIX_LOCK(type) \
-static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
-{ \
- return six_trylock_ip(lock, SIX_LOCK_##type, ip); \
-} \
- \
-static inline bool six_trylock_##type(struct six_lock *lock) \
-{ \
- return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
-} \
- \
-static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \
- struct six_lock_waiter *wait, \
- six_lock_should_sleep_fn should_sleep_fn, void *p,\
- unsigned long ip) \
-{ \
- return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
-} \
- \
-static inline int six_lock_ip_##type(struct six_lock *lock, \
- six_lock_should_sleep_fn should_sleep_fn, void *p, \
- unsigned long ip) \
-{ \
- return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
-} \
- \
-static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
-{ \
- return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \
-} \
- \
-static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \
-{ \
- return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \
-} \
- \
-static inline int six_lock_##type(struct six_lock *lock, \
- six_lock_should_sleep_fn fn, void *p)\
-{ \
- return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \
-} \
- \
-static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \
-{ \
- six_unlock_ip(lock, SIX_LOCK_##type, ip); \
-} \
- \
-static inline void six_unlock_##type(struct six_lock *lock) \
-{ \
- six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
-}
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-#undef __SIX_LOCK
-
-void six_lock_downgrade(struct six_lock *);
-bool six_lock_tryupgrade(struct six_lock *);
-bool six_trylock_convert(struct six_lock *, enum six_lock_type,
- enum six_lock_type);
-
-void six_lock_increment(struct six_lock *, enum six_lock_type);
-
-void six_lock_wakeup_all(struct six_lock *);
-
-struct six_lock_count {
- unsigned n[3];
-};
-
-struct six_lock_count six_lock_counts(struct six_lock *);
-void six_lock_readers_add(struct six_lock *, int);
-
-#endif /* _LINUX_SIX_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
deleted file mode 100644
index 4c43d2a2c1f5..000000000000
--- a/fs/bcachefs/snapshot.c
+++ /dev/null
@@ -1,2043 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "bbpos.h"
-#include "bkey_buf.h"
-#include "btree_cache.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-
-#include <linux/random.h>
-
-/*
- * Snapshot trees:
- *
- * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
- * exist to provide a stable identifier for the whole lifetime of a snapshot
- * tree.
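- *
- * Illustrative layout only (IDs made up; fields are the ones referenced
- * below):
- *
- *   snapshot_trees/1:  root_snapshot=20  master_subvol=1
- *   snapshots/20:      tree=1  parent=0   children: 19 18
- *   snapshots/19:      tree=1  parent=20  subvol=1
- *   snapshots/18:      tree=1  parent=20  subvol=2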
- */
-
-void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
-
- prt_printf(out, "subvol %u root snapshot %u",
- le32_to_cpu(t.v->master_subvol),
- le32_to_cpu(t.v->root_snapshot));
-}
-
-int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- int ret = 0;
-
- bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1)),
- c, snapshot_tree_pos_bad,
- "bad pos");
-fsck_err:
- return ret;
-}
-
-int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot_tree *s)
-{
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
- BTREE_ITER_with_updates, snapshot_tree, s);
-
- if (bch2_err_matches(ret, ENOENT))
- ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
- return ret;
-}
-
-struct bkey_i_snapshot_tree *
-__bch2_snapshot_tree_create(struct btree_trans *trans)
-{
- struct btree_iter iter;
- int ret = bch2_bkey_get_empty_slot(trans, &iter,
- BTREE_ID_snapshot_trees, POS(0, U32_MAX));
- struct bkey_i_snapshot_tree *s_t;
-
- if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
- if (ret)
- return ERR_PTR(ret);
-
- s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(s_t);
- bch2_trans_iter_exit(trans, &iter);
- return ret ? ERR_PTR(ret) : s_t;
-}
-
-static int bch2_snapshot_tree_create(struct btree_trans *trans,
- u32 root_id, u32 subvol_id, u32 *tree_id)
-{
- struct bkey_i_snapshot_tree *n_tree =
- __bch2_snapshot_tree_create(trans);
-
- if (IS_ERR(n_tree))
- return PTR_ERR(n_tree);
-
- n_tree->v.master_subvol = cpu_to_le32(subvol_id);
- n_tree->v.root_snapshot = cpu_to_le32(root_id);
- *tree_id = n_tree->k.p.offset;
- return 0;
-}
-
-/* Snapshot nodes: */
-
-static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
-{
- while (id && id < ancestor) {
- const struct snapshot_t *s = __snapshot_t(t, id);
- id = s ? s->parent : 0;
- }
- return id == ancestor;
-}
-
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
-{
- guard(rcu)();
- return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
-}
-
-static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
-{
- const struct snapshot_t *s = __snapshot_t(t, id);
- if (!s)
- return 0;
-
- if (s->skip[2] <= ancestor)
- return s->skip[2];
- if (s->skip[1] <= ancestor)
- return s->skip[1];
- if (s->skip[0] <= ancestor)
- return s->skip[0];
- return s->parent;
-}
-
-static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
-{
- const struct snapshot_t *s = __snapshot_t(t, id);
- if (!s)
- return false;
-
- return test_bit(ancestor - id - 1, s->is_ancestor);
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- u32 orig_id = id;
-#endif
-
- guard(rcu)();
- struct snapshot_table *t = rcu_dereference(c->snapshots);
-
- if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
- return __bch2_snapshot_is_ancestor_early(t, id, ancestor);
-
- if (likely(ancestor >= IS_ANCESTOR_BITMAP))
- while (id && id < ancestor - IS_ANCESTOR_BITMAP)
- id = get_ancestor_below(t, id, ancestor);
-
- bool ret = id && id < ancestor
- ? test_ancestor_bitmap(t, id, ancestor)
- : id == ancestor;
-
- EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor));
- return ret;
-}
-
-static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
-{
- size_t idx = U32_MAX - id;
- struct snapshot_table *new, *old;
-
- size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
- size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
-
- if (unlikely(new_bytes > INT_MAX))
- return NULL;
-
- new = kvzalloc(new_bytes, GFP_KERNEL);
- if (!new)
- return NULL;
-
- new->nr = new_size;
-
- old = rcu_dereference_protected(c->snapshots, true);
- if (old)
- memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
-
- rcu_assign_pointer(c->snapshots, new);
- kvfree_rcu(old, rcu);
-
- return &rcu_dereference_protected(c->snapshots,
- lockdep_is_held(&c->snapshot_table_lock))->s[idx];
-}
-
-static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
-{
- size_t idx = U32_MAX - id;
- struct snapshot_table *table =
- rcu_dereference_protected(c->snapshots,
- lockdep_is_held(&c->snapshot_table_lock));
-
- lockdep_assert_held(&c->snapshot_table_lock);
-
- if (likely(table && idx < table->nr))
- return &table->s[idx];
-
- return __snapshot_t_mut(c, id);
-}
-
-void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
-
- if (BCH_SNAPSHOT_SUBVOL(s.v))
- prt_str(out, "subvol ");
- if (BCH_SNAPSHOT_WILL_DELETE(s.v))
- prt_str(out, "will_delete ");
- if (BCH_SNAPSHOT_DELETED(s.v))
- prt_str(out, "deleted ");
-
- prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u",
- le32_to_cpu(s.v->parent),
- le32_to_cpu(s.v->children[0]),
- le32_to_cpu(s.v->children[1]),
- le32_to_cpu(s.v->subvol),
- le32_to_cpu(s.v->tree));
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
- prt_printf(out, " depth %u skiplist %u %u %u",
- le32_to_cpu(s.v->depth),
- le32_to_cpu(s.v->skip[0]),
- le32_to_cpu(s.v->skip[1]),
- le32_to_cpu(s.v->skip[2]));
-}
-
-int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_snapshot s;
- u32 i, id;
- int ret = 0;
-
- bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1)),
- c, snapshot_pos_bad,
- "bad pos");
-
- s = bkey_s_c_to_snapshot(k);
-
- id = le32_to_cpu(s.v->parent);
- bkey_fsck_err_on(id && id <= k.k->p.offset,
- c, snapshot_parent_bad,
- "bad parent node (%u <= %llu)",
- id, k.k->p.offset);
-
- bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
- c, snapshot_children_not_normalized,
- "children not normalized");
-
- bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
- c, snapshot_child_duplicate,
- "duplicate child nodes");
-
- for (i = 0; i < 2; i++) {
- id = le32_to_cpu(s.v->children[i]);
-
- bkey_fsck_err_on(id >= k.k->p.offset,
- c, snapshot_child_bad,
- "bad child node (%u >= %llu)",
- id, k.k->p.offset);
- }
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
- bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
- le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
- c, snapshot_skiplist_not_normalized,
- "skiplist not normalized");
-
- for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
- id = le32_to_cpu(s.v->skip[i]);
-
- bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
- c, snapshot_skiplist_bad,
- "bad skiplist node %u", id);
- }
- }
-fsck_err:
- return ret;
-}
-
-static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
-{
- mutex_lock(&c->snapshot_table_lock);
- int ret = snapshot_t_mut(c, id)
- ? 0
- : bch_err_throw(c, ENOMEM_mark_snapshot);
- mutex_unlock(&c->snapshot_table_lock);
- return ret;
-}
-
-static int __bch2_mark_snapshot(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_t *t;
- u32 id = new.k->p.offset;
- int ret = 0;
-
- mutex_lock(&c->snapshot_table_lock);
-
- t = snapshot_t_mut(c, id);
- if (!t) {
- ret = bch_err_throw(c, ENOMEM_mark_snapshot);
- goto err;
- }
-
- if (new.k->type == KEY_TYPE_snapshot) {
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
-
- t->state = !BCH_SNAPSHOT_DELETED(s.v)
- ? SNAPSHOT_ID_live
- : SNAPSHOT_ID_deleted;
- t->parent = le32_to_cpu(s.v->parent);
- t->children[0] = le32_to_cpu(s.v->children[0]);
- t->children[1] = le32_to_cpu(s.v->children[1]);
- t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
- t->tree = le32_to_cpu(s.v->tree);
-
- if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
- t->depth = le32_to_cpu(s.v->depth);
- t->skip[0] = le32_to_cpu(s.v->skip[0]);
- t->skip[1] = le32_to_cpu(s.v->skip[1]);
- t->skip[2] = le32_to_cpu(s.v->skip[2]);
- } else {
- t->depth = 0;
- t->skip[0] = 0;
- t->skip[1] = 0;
- t->skip[2] = 0;
- }
-
- u32 parent = id;
-
- while ((parent = bch2_snapshot_parent_early(c, parent)) &&
- parent - id - 1 < IS_ANCESTOR_BITMAP)
- __set_bit(parent - id - 1, t->is_ancestor);
-
- if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
- set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
- if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots)
- bch2_delete_dead_snapshots_async(c);
- }
- } else {
- memset(t, 0, sizeof(*t));
- }
-err:
- mutex_unlock(&c->snapshot_table_lock);
- return ret;
-}
-
-int bch2_mark_snapshot(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
-}
-
-int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot *s)
-{
- return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_with_updates, snapshot, s);
-}
-
-/* fsck: */
-
-static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
-{
- return snapshot_t(c, id)->children[child];
-}
-
-static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
-{
- return bch2_snapshot_child(c, id, 0);
-}
-
-static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
-{
- return bch2_snapshot_child(c, id, 1);
-}
-
-static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
-{
- u32 n, parent;
-
- n = bch2_snapshot_left_child(c, id);
- if (n)
- return n;
-
- while ((parent = bch2_snapshot_parent(c, id))) {
- n = bch2_snapshot_right_child(c, parent);
- if (n && n != id)
- return n;
- id = parent;
- }
-
- return 0;
-}
-
-u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
- snapshot_id_list *skip)
-{
- guard(rcu)();
- u32 id, subvol = 0, s;
-retry:
- id = snapshot_root;
- while (id && bch2_snapshot_exists(c, id)) {
- if (!(skip && snapshot_list_has_id(skip, id))) {
- s = snapshot_t(c, id)->subvol;
-
- if (s && (!subvol || s < subvol))
- subvol = s;
- }
- id = bch2_snapshot_tree_next(c, id);
- if (id == snapshot_root)
- break;
- }
-
- if (!subvol && skip) {
- skip = NULL;
- goto retry;
- }
-
- return subvol;
-}
-
-static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
- u32 snapshot_root, u32 *subvol_id)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- bool found = false;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_subvolume)
- continue;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
- continue;
- if (!BCH_SUBVOLUME_SNAP(s.v)) {
- *subvol_id = s.k->p.offset;
- found = true;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (!ret && !found) {
- struct bkey_i_subvolume *u;
-
- *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL);
-
- u = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, *subvol_id),
- 0, subvolume);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- SET_BCH_SUBVOLUME_SNAP(&u->v, false);
- }
-
- return ret;
-}
-
-static int check_snapshot_tree(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_snapshot_tree st;
- struct bch_snapshot s;
- struct bch_subvolume subvol;
- struct printbuf buf = PRINTBUF;
- struct btree_iter snapshot_iter = {};
- u32 root_id;
- int ret;
-
- if (k.k->type != KEY_TYPE_snapshot_tree)
- return 0;
-
- st = bkey_s_c_to_snapshot_tree(k);
- root_id = le32_to_cpu(st.v->root_snapshot);
-
- struct bkey_s_c_snapshot snapshot_k =
- bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
- POS(0, root_id), 0, snapshot);
- ret = bkey_err(snapshot_k);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (!ret)
- bkey_val_copy(&s, snapshot_k);
-
- if (fsck_err_on(ret ||
- root_id != bch2_snapshot_root(c, root_id) ||
- st.k->p.offset != le32_to_cpu(s.tree),
- trans, snapshot_tree_to_missing_snapshot,
- "snapshot tree points to missing/incorrect snapshot:\n%s",
- (bch2_bkey_val_to_text(&buf, c, st.s_c),
- prt_newline(&buf),
- ret
- ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
- : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
- buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto err;
- }
-
- if (!st.v->master_subvol)
- goto out;
-
- ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret,
- trans, snapshot_tree_to_missing_subvol,
- "snapshot tree points to missing subvolume:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(!bch2_snapshot_is_ancestor(c,
- le32_to_cpu(subvol.snapshot),
- root_id),
- trans, snapshot_tree_to_wrong_subvol,
- "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
- trans, snapshot_tree_to_snapshot_subvol,
- "snapshot tree points to snapshot subvolume:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
- struct bkey_i_snapshot_tree *u;
- u32 subvol_id;
-
- ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
- bch_err_fn(c, ret);
-
- if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
- ret = 0;
- goto err;
- }
-
- if (ret)
- goto err;
-
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.master_subvol = cpu_to_le32(subvol_id);
- st = snapshot_tree_i_to_s_c(u);
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &snapshot_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * For each snapshot_tree, make sure it points to the root of a snapshot tree
- * and that the root snapshot entry points back to it, or delete it.
- *
- * And, make sure it points to a subvolume within that snapshot tree, or correct
- * it to point to the oldest subvolume within that snapshot tree.
- */
-int bch2_check_snapshot_trees(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_snapshot_trees, POS_MIN,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_snapshot_tree(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-/*
- * Look up snapshot tree for @tree_id and find root,
- * make sure @snap_id is a descendant:
- */
-static int snapshot_tree_ptr_good(struct btree_trans *trans,
- u32 snap_id, u32 tree_id)
-{
- struct bch_snapshot_tree s_t;
- int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
-
- if (bch2_err_matches(ret, ENOENT))
- return 0;
- if (ret)
- return ret;
-
- return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
-}
-
-u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
-{
- if (!id)
- return 0;
-
- guard(rcu)();
- const struct snapshot_t *s = snapshot_t(c, id);
- return s->parent
- ? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
- : id;
-}
-
-static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
-{
- unsigned i;
-
- for (i = 0; i < 3; i++)
- if (!s.parent) {
- if (s.skip[i])
- return false;
- } else {
- if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
- return false;
- }
-
- return true;
-}
-
-/*
- * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
- * its snapshot_tree pointer is correct (allocate new one if necessary), then
- * update this node's pointer to root node's pointer:
- */
-static int snapshot_tree_ptr_repair(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_snapshot *s)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter root_iter;
- struct bch_snapshot_tree s_t;
- struct bkey_s_c_snapshot root;
- struct bkey_i_snapshot *u;
- u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
- int ret;
-
- root = bch2_bkey_get_iter_typed(trans, &root_iter,
- BTREE_ID_snapshots, POS(0, root_id),
- BTREE_ITER_with_updates, snapshot);
- ret = bkey_err(root);
- if (ret)
- goto err;
-
- tree_id = le32_to_cpu(root.v->tree);
-
- ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
- u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u) ?:
- bch2_snapshot_tree_create(trans, root_id,
- bch2_snapshot_oldest_subvol(c, root_id, NULL),
- &tree_id);
- if (ret)
- goto err;
-
- u->v.tree = cpu_to_le32(tree_id);
- if (k.k->p.offset == root_id)
- *s = u->v;
- }
-
- if (k.k->p.offset != root_id) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.tree = cpu_to_le32(tree_id);
- *s = u->v;
- }
-err:
- bch2_trans_iter_exit(trans, &root_iter);
- return ret;
-}
-
-static int check_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bch_snapshot s;
- struct bch_subvolume subvol;
- struct bch_snapshot v;
- struct bkey_i_snapshot *u;
- u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
- u32 real_depth;
- struct printbuf buf = PRINTBUF;
- u32 i, id;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- memset(&s, 0, sizeof(s));
- memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
-
- if (BCH_SNAPSHOT_DELETED(&s))
- return 0;
-
- id = le32_to_cpu(s.parent);
- if (id) {
- ret = bch2_snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot with nonexistent parent:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (ret)
- goto err;
-
- if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
- le32_to_cpu(v.children[1]) != k.k->p.offset) {
- bch_err(c, "snapshot parent %u missing pointer to child %llu",
- id, k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- }
-
- for (i = 0; i < 2 && s.children[i]; i++) {
- id = le32_to_cpu(s.children[i]);
-
- ret = bch2_snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot node %llu has nonexistent child %u",
- k.k->p.offset, id);
- if (ret)
- goto err;
-
- if (le32_to_cpu(v.parent) != k.k->p.offset) {
- bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
- id, le32_to_cpu(v.parent), k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- }
-
- bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
- !BCH_SNAPSHOT_WILL_DELETE(&s);
-
- if (should_have_subvol) {
- id = le32_to_cpu(s.subvol);
- ret = bch2_subvolume_get(trans, id, false, &subvol);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (ret)
- goto err;
-
- if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
- bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
- k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- } else {
- if (fsck_err_on(s.subvol,
- trans, snapshot_should_not_have_subvol,
- "snapshot should not point to subvol:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.subvol = 0;
- s = u->v;
- }
- }
-
- ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(!ret,
- trans, snapshot_to_bad_snapshot_tree,
- "snapshot points to missing/incorrect tree:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
- if (ret)
- goto err;
- }
- ret = 0;
-
- real_depth = bch2_snapshot_depth(c, parent_id);
-
- if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
- trans, snapshot_bad_depth,
- "snapshot with incorrect depth field, should be %u:\n%s",
- real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.depth = cpu_to_le32(real_depth);
- s = u->v;
- }
-
- ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(!ret,
- trans, snapshot_bad_skiplist,
- "snapshot with bad skiplist field:\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
- u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
-
- bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
- s = u->v;
- }
- ret = 0;
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_snapshots(struct bch_fs *c)
-{
- /*
- * We iterate backwards as checking/fixing the depth field requires that
- * the parent's depth already be correct:
- */
- int ret = bch2_trans_run(c,
- for_each_btree_key_reverse_commit(trans, iter,
- BTREE_ID_snapshots, POS_MAX,
- BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_snapshot(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_snapshot_exists(struct btree_trans *trans, u32 id)
-{
- struct bch_fs *c = trans->c;
-
- /* Do we need to reconstruct the snapshot_tree entry as well? */
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
- u32 tree_id = 0;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
- 0, k, ret) {
- if (k.k->type == KEY_TYPE_snapshot_tree &&
- le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
- tree_id = k.k->p.offset;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- return ret;
-
- if (!tree_id) {
- ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
- if (ret)
- return ret;
- }
-
- struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
- ret = PTR_ERR_OR_ZERO(snapshot);
- if (ret)
- return ret;
-
- bkey_snapshot_init(&snapshot->k_i);
- snapshot->k.p = POS(0, id);
- snapshot->v.tree = cpu_to_le32(tree_id);
- snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
- 0, k, ret) {
- if (k.k->type == KEY_TYPE_subvolume &&
- le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
- snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
- SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return bch2_snapshot_table_make_room(c, id) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0);
-}
-
-/* Figure out which snapshot nodes belong in the same tree: */
-struct snapshot_tree_reconstruct {
- enum btree_id btree;
- struct bpos cur_pos;
- snapshot_id_list cur_ids;
- DARRAY(snapshot_id_list) trees;
-};
-
-static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
-{
- darray_for_each(r->trees, i)
- darray_exit(i);
- darray_exit(&r->trees);
- darray_exit(&r->cur_ids);
-}
-
-static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
-{
- return r->btree == BTREE_ID_inodes
- ? r->cur_pos.offset == pos.offset
- : r->cur_pos.inode == pos.inode;
-}
-
-static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
-{
- return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
-}
-
-static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
-{
- bool first = true;
- darray_for_each(*s, i) {
- if (!first)
- prt_char(out, ' ');
- first = false;
- prt_printf(out, "%u", *i);
- }
-}
-
-static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
-{
- if (r->cur_ids.nr) {
- darray_for_each(r->trees, i)
- if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
- int ret = snapshot_list_merge(c, i, &r->cur_ids);
- if (ret)
- return ret;
- goto out;
- }
- darray_push(&r->trees, r->cur_ids);
- darray_init(&r->cur_ids);
- }
-out:
- r->cur_ids.nr = 0;
- return 0;
-}
-
-static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
-{
- if (!same_snapshot(r, pos))
- snapshot_tree_reconstruct_next(c, r);
- r->cur_pos = pos;
- return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
-}
-
-int bch2_reconstruct_snapshots(struct bch_fs *c)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct printbuf buf = PRINTBUF;
- struct snapshot_tree_reconstruct r = {};
- int ret = 0;
-
- for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
- if (btree_type_has_snapshots(btree)) {
- r.btree = btree;
-
- ret = for_each_btree_key(trans, iter, btree, POS_MIN,
- BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
- get_snapshot_trees(c, &r, k.k->p);
- }));
- if (ret)
- goto err;
-
- snapshot_tree_reconstruct_next(c, &r);
- }
- }
-
- darray_for_each(r.trees, t) {
- printbuf_reset(&buf);
- snapshot_id_list_to_text(&buf, t);
-
- darray_for_each(*t, id) {
- if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
- trans, snapshot_node_missing,
- "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
- if (t->nr > 1) {
- bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- goto err;
- }
-
- ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_snapshot_exists(trans, *id));
- if (ret)
- goto err;
- }
- }
- }
-fsck_err:
-err:
- bch2_trans_put(trans);
- snapshot_tree_reconstruct_exit(&r);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-int __bch2_check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
- enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);
-
- /* Snapshot was definitively deleted; this error is marked autofix */
- if (fsck_err_on(state == SNAPSHOT_ID_deleted,
- trans, bkey_in_deleted_snapshot,
- "key in deleted snapshot %s, delete?",
- (bch2_btree_id_to_text(&buf, iter->btree_id),
- prt_char(&buf, ' '),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node) ?: 1;
-
- if (state == SNAPSHOT_ID_empty) {
- /*
- * Snapshot missing: we should have caught this with btree_lost_data and
- * kicked off reconstruct_snapshots, so if we end up here we have no
- * idea what happened.
- *
- * Do not delete unless we know that subvolumes and snapshots
- * are consistent:
- *
- * XXX:
- *
- * We could be smarter here, and instead of using the generic
- * recovery pass ratelimiting, track if there have been any
- * changes to the snapshots or inodes btrees since those passes
- * last ran.
- */
- ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
- ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;
-
- if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
- ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
-
- unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);
-
- if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
- "key in missing snapshot %s, delete?",
- (bch2_btree_id_to_text(&buf, iter->btree_id),
- prt_char(&buf, ' '),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node) ?: 1;
- }
- }
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos,
- snapshot_id_list *s)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
- BTREE_ITER_all_snapshots, k, ret) {
- if (!bkey_eq(k.k->p, pos))
- break;
-
- if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
- snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
- continue;
-
- ret = snapshot_list_add(c, s, k.k->p.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- darray_exit(s);
-
- return ret;
-}
-
-/*
- * Mark a snapshot as deleted, for future cleanup:
- */
-int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
-{
- struct btree_iter iter;
- struct bkey_i_snapshot *s =
- bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, id),
- 0, snapshot);
- int ret = PTR_ERR_OR_ZERO(s);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
- trans->c, "missing snapshot %u", id);
- return ret;
- }
-
- /* already deleted? */
- if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
- goto err;
-
- SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
- SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
- s->v.subvol = 0;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
-{
- if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
- swap(s->children[0], s->children[1]);
-}
-
-static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter, p_iter = {};
- struct btree_iter c_iter = {};
- struct btree_iter tree_iter = {};
- u32 parent_id, child_id;
- unsigned i;
- int ret = 0;
-
- struct bkey_i_snapshot *s =
- bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_intent, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", id);
-
- if (ret)
- goto err;
-
- BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
- BUG_ON(s->v.children[1]);
-
- parent_id = le32_to_cpu(s->v.parent);
- child_id = le32_to_cpu(s->v.children[0]);
-
- if (parent_id) {
- struct bkey_i_snapshot *parent;
-
- parent = bch2_bkey_get_mut_typed(trans, &p_iter,
- BTREE_ID_snapshots, POS(0, parent_id),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(parent);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", parent_id);
- if (unlikely(ret))
- goto err;
-
- /* find entry in parent->children for node being deleted */
- for (i = 0; i < 2; i++)
- if (le32_to_cpu(parent->v.children[i]) == id)
- break;
-
- if (bch2_fs_inconsistent_on(i == 2, c,
- "snapshot %u missing child pointer to %u",
- parent_id, id))
- goto err;
-
- parent->v.children[i] = cpu_to_le32(child_id);
-
- normalize_snapshot_child_pointers(&parent->v);
- }
-
- if (child_id) {
- struct bkey_i_snapshot *child;
-
- child = bch2_bkey_get_mut_typed(trans, &c_iter,
- BTREE_ID_snapshots, POS(0, child_id),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(child);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", child_id);
- if (unlikely(ret))
- goto err;
-
- child->v.parent = cpu_to_le32(parent_id);
-
- if (!child->v.parent) {
- child->v.skip[0] = 0;
- child->v.skip[1] = 0;
- child->v.skip[2] = 0;
- }
- }
-
- if (!parent_id) {
- /*
- * We're deleting the root of a snapshot tree: update the
- * snapshot_tree entry to point to the new root, or delete it if
- * this is the last snapshot ID in this tree:
- */
- struct bkey_i_snapshot_tree *s_t;
-
- BUG_ON(s->v.children[1]);
-
- s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
- BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
- 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(s_t);
- if (ret)
- goto err;
-
- if (s->v.children[0]) {
- s_t->v.root_snapshot = s->v.children[0];
- } else {
- s_t->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&s_t->k, 0);
- }
- }
-
- if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
- SET_BCH_SNAPSHOT_DELETED(&s->v, true);
- s->v.parent = 0;
- s->v.children[0] = 0;
- s->v.children[1] = 0;
- s->v.subvol = 0;
- s->v.tree = 0;
- s->v.depth = 0;
- s->v.skip[0] = 0;
- s->v.skip[1] = 0;
- s->v.skip[2] = 0;
- } else {
- s->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&s->k, 0);
- }
-err:
- bch2_trans_iter_exit(trans, &tree_iter);
- bch2_trans_iter_exit(trans, &p_iter);
- bch2_trans_iter_exit(trans, &c_iter);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_snapshot *n;
- struct bkey_s_c k;
- unsigned i, j;
- u32 depth = bch2_snapshot_depth(c, parent);
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
- POS_MIN, BTREE_ITER_intent);
- k = bch2_btree_iter_peek(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- for (i = 0; i < nr_snapids; i++) {
- k = bch2_btree_iter_prev_slot(trans, &iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k || !k.k->p.offset) {
- ret = bch_err_throw(c, ENOSPC_snapshot_create);
- goto err;
- }
-
- n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- n->v.flags = 0;
- n->v.parent = cpu_to_le32(parent);
- n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
- n->v.tree = cpu_to_le32(tree);
- n->v.depth = cpu_to_le32(depth);
- n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
- n->v.btime.hi = 0;
-
- for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
- n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
-
- bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
- SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
-
- ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
- bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
- if (ret)
- goto err;
-
- new_snapids[i] = iter.pos.offset;
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * Create new snapshot IDs as children of an existing snapshot ID:
- */
-static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct btree_iter iter;
- struct bkey_i_snapshot *n_parent;
- int ret = 0;
-
- n_parent = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, parent),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(n_parent);
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot %u not found", parent);
- return ret;
- }
-
- if (n_parent->v.children[0] || n_parent->v.children[1]) {
- bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
- ret = -EINVAL;
- goto err;
- }
-
- ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
- new_snapids, snapshot_subvols, nr_snapids);
- if (ret)
- goto err;
-
- n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
- n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
- n_parent->v.subvol = 0;
- SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * Create a snapshot node that is the root of a new tree:
- */
-static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct bkey_i_snapshot_tree *n_tree;
- int ret;
-
- n_tree = __bch2_snapshot_tree_create(trans);
- ret = PTR_ERR_OR_ZERO(n_tree) ?:
- create_snapids(trans, 0, n_tree->k.p.offset,
- new_snapids, snapshot_subvols, nr_snapids);
- if (ret)
- return ret;
-
- n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
- n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
- return 0;
-}
-
-int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- BUG_ON((parent == 0) != (nr_snapids == 1));
- BUG_ON((parent != 0) != (nr_snapids == 2));
-
- return parent
- ? bch2_snapshot_node_create_children(trans, parent,
- new_snapids, snapshot_subvols, nr_snapids)
- : bch2_snapshot_node_create_tree(trans,
- new_snapids, snapshot_subvols, nr_snapids);
-
-}
-
-/*
- * If we have an unlinked inode in an internal snapshot node, and the inode
- * really has been deleted in all child snapshots, how does this get cleaned up?
- *
- * first there is the problem of how keys that have been overwritten in all
- * child snapshots get deleted (unimplemented?), but inodes may perhaps be
- * special?
- *
- * also: unlinked inode in internal snapshot appears to not be getting deleted
- * correctly if inode doesn't exist in leaf snapshots
- *
- * solution:
- *
- * for a key in an interior snapshot node that needs work to be done that
- * requires it to be mutated: iterate over all descendant leaf nodes and copy
- * that key to snapshot leaf nodes, where we can mutate it
- */
-
-static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
-{
- struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id);
- return i ? i->live_child : 0;
-}
-
-static unsigned __live_child(struct snapshot_table *t, u32 id,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
-{
- struct snapshot_t *s = __snapshot_t(t, id);
- if (!s)
- return 0;
-
- for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
- if (s->children[i] &&
- !snapshot_list_has_id(delete_leaves, s->children[i]) &&
- !interior_delete_has_id(delete_interior, s->children[i]))
- return s->children[i];
-
- for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
- u32 live_child = s->children[i]
- ? __live_child(t, s->children[i], delete_leaves, delete_interior)
- : 0;
- if (live_child)
- return live_child;
- }
-
- return 0;
-}
-
-static unsigned live_child(struct bch_fs *c, u32 id)
-{
- struct snapshot_delete *d = &c->snapshot_delete;
-
- guard(rcu)();
- return __live_child(rcu_dereference(c->snapshots), id,
- &d->delete_leaves, &d->delete_interior);
-}
-
-static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
-{
- return snapshot_list_has_id(&d->delete_leaves, id) ||
- interior_delete_has_id(&d->delete_interior, id) != 0;
-}
-
-static int delete_dead_snapshots_process_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct snapshot_delete *d = &trans->c->snapshot_delete;
-
- if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
- return bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node);
-
- u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
- if (live_child) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- new->k.p.snapshot = live_child;
-
- struct btree_iter dst_iter;
- struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
- iter->btree_id, new->k.p,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_intent);
- ret = bkey_err(dst_k);
- if (ret)
- return ret;
-
- ret = (bkey_deleted(dst_k.k)
- ? bch2_trans_update(trans, &dst_iter, new,
- BTREE_UPDATE_internal_snapshot_node)
- : 0) ?:
- bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &dst_iter);
- return ret;
- }
-
- return 0;
-}
-
-static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_delete *d = &c->snapshot_delete;
-
- u64 inum = iter->btree_id != BTREE_ID_inodes
- ? iter->pos.inode
- : iter->pos.offset;
-
- if (*prev_inum == inum)
- return false;
-
- *prev_inum = inum;
-
- bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
- bch2_snapshot_tree(c, iter->pos.snapshot));
- if (unlikely(ret)) {
- struct bpos pos = iter->pos;
- pos.snapshot = 0;
- if (iter->btree_id != BTREE_ID_inodes)
- pos.offset = U64_MAX;
- bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
- }
-
- return ret;
-}
-
-static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_delete *d = &c->snapshot_delete;
-
- for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
- struct disk_reservation res = { 0 };
- u64 prev_inum = 0;
-
- d->pos.pos = POS_MIN;
-
- if (!btree_type_has_snapshots(d->pos.btree))
- continue;
-
- int ret = for_each_btree_key_commit(trans, iter,
- d->pos.btree, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- d->pos.pos = iter.pos;
-
- if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
- continue;
-
- delete_dead_snapshots_process_key(trans, &iter, k);
- }));
-
- bch2_disk_reservation_put(c, &res);
-
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
- struct bpos start, struct bpos end)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_delete *d = &c->snapshot_delete;
- struct disk_reservation res = { 0 };
-
- d->pos.btree = btree;
- d->pos.pos = POS_MIN;
-
- int ret = for_each_btree_key_max_commit(trans, iter,
- btree, start, end,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- d->pos.pos = iter.pos;
- delete_dead_snapshots_process_key(trans, &iter, k);
- }));
-
- bch2_disk_reservation_put(c, &res);
- return ret;
-}
-
-static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_delete *d = &c->snapshot_delete;
- struct disk_reservation res = { 0 };
- u64 prev_inum = 0;
- int ret = 0;
-
- struct btree_iter iter;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
-
- while (1) {
- struct bkey_s_c k;
- ret = lockrestart_do(trans,
- bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
- if (ret)
- break;
-
- if (!k.k)
- break;
-
- d->pos.btree = iter.btree_id;
- d->pos.pos = iter.pos;
-
- if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
- continue;
-
- if (snapshot_id_dying(d, k.k->p.snapshot)) {
- struct bpos start = POS(k.k->p.offset, 0);
- struct bpos end = POS(k.k->p.offset, U64_MAX);
-
- ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
- delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
- delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
- if (ret)
- break;
-
- bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
- } else {
- bch2_btree_iter_advance(trans, &iter);
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- goto err;
-
- prev_inum = 0;
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- d->pos.btree = iter.btree_id;
- d->pos.pos = iter.pos;
-
- if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
- continue;
-
- delete_dead_snapshots_process_key(trans, &iter, k);
- }));
-err:
- bch2_disk_reservation_put(c, &res);
- return ret;
-}
-
-/*
- * For a given snapshot, if it doesn't have a subvolume that points to it, and
- * it doesn't have child snapshot nodes, it's now redundant and we can mark it
- * as deleted.
- */
-static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
-{
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- struct bch_fs *c = trans->c;
- struct snapshot_delete *d = &c->snapshot_delete;
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
- unsigned live_children = 0;
- int ret = 0;
-
- if (BCH_SNAPSHOT_SUBVOL(s.v))
- return 0;
-
- if (BCH_SNAPSHOT_DELETED(s.v))
- return 0;
-
- mutex_lock(&d->progress_lock);
- for (unsigned i = 0; i < 2; i++) {
- u32 child = le32_to_cpu(s.v->children[i]);
-
- live_children += child &&
- !snapshot_list_has_id(&d->delete_leaves, child);
- }
-
- u32 tree = bch2_snapshot_tree(c, s.k->p.offset);
-
- if (live_children == 0) {
- ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
- snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
- } else if (live_children == 1) {
- struct snapshot_interior_delete n = {
- .id = s.k->p.offset,
- .live_child = live_child(c, s.k->p.offset),
- };
-
- if (!n.live_child) {
- bch_err(c, "error finding live child of snapshot %u", n.id);
- ret = -EINVAL;
- } else {
- ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
- darray_push(&d->delete_interior, n);
- }
- }
- mutex_unlock(&d->progress_lock);
-
- return ret;
-}
-
-static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
- interior_delete_list *skip)
-{
- guard(rcu)();
- while (interior_delete_has_id(skip, id))
- id = __bch2_snapshot_parent(c, id);
-
- while (n--) {
- do {
- id = __bch2_snapshot_parent(c, id);
- } while (interior_delete_has_id(skip, id));
- }
-
- return id;
-}
-
-static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
- struct btree_iter *iter, struct bkey_s_c k,
- interior_delete_list *deleted)
-{
- struct bch_fs *c = trans->c;
- u32 nr_deleted_ancestors = 0;
- struct bkey_i_snapshot *s;
- int ret;
-
- if (!bch2_snapshot_exists(c, k.k->p.offset))
- return 0;
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- if (interior_delete_has_id(deleted, k.k->p.offset))
- return 0;
-
- s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- darray_for_each(*deleted, i)
- nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
-
- if (!nr_deleted_ancestors)
- return 0;
-
- le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
-
- if (!s->v.depth) {
- s->v.skip[0] = 0;
- s->v.skip[1] = 0;
- s->v.skip[2] = 0;
- } else {
- u32 depth = le32_to_cpu(s->v.depth);
- u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
-
- for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
- u32 id = le32_to_cpu(s->v.skip[j]);
-
- if (interior_delete_has_id(deleted, id)) {
- id = bch2_snapshot_nth_parent_skip(c,
- parent,
- depth > 1
- ? get_random_u32_below(depth - 1)
- : 0,
- deleted);
- s->v.skip[j] = cpu_to_le32(id);
- }
- }
-
- bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
- }
-
- return bch2_trans_update(trans, iter, &s->k_i, 0);
-}
-
-static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
-{
- prt_printf(out, "deleting from trees");
- darray_for_each(d->deleting_from_trees, i)
- prt_printf(out, " %u", *i);
-
- prt_printf(out, "deleting leaves");
- darray_for_each(d->delete_leaves, i)
- prt_printf(out, " %u", *i);
- prt_newline(out);
-
- prt_printf(out, "interior");
- darray_for_each(d->delete_interior, i)
- prt_printf(out, " %u->%u", i->id, i->live_child);
- prt_newline(out);
-}
-
-int __bch2_delete_dead_snapshots(struct bch_fs *c)
-{
- struct snapshot_delete *d = &c->snapshot_delete;
- int ret = 0;
-
- if (!mutex_trylock(&d->lock))
- return 0;
-
- if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
- goto out_unlock;
-
- struct btree_trans *trans = bch2_trans_get(c);
-
- /*
- * For every snapshot node: if it has no live children and is not
- * pointed to by a subvolume, delete it:
- */
- d->running = true;
- d->pos = BBPOS_MIN;
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
- check_should_delete_snapshot(trans, k));
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "walking snapshots");
- if (ret)
- goto err;
-
- if (!d->delete_leaves.nr && !d->delete_interior.nr)
- goto err;
-
- {
- struct printbuf buf = PRINTBUF;
- bch2_snapshot_delete_nodes_to_text(&buf, d);
-
- ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
- printbuf_exit(&buf);
- if (ret)
- goto err;
- }
-
- ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
- ? delete_dead_snapshot_keys_v2(trans)
- : delete_dead_snapshot_keys_v1(trans);
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
- if (ret)
- goto err;
-
- darray_for_each(d->delete_leaves, i) {
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, *i));
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
- if (ret)
- goto err;
- }
-
- /*
-	 * Fixing children of deleted snapshots can't be done completely
-	 * atomically; if we crash between here and when we delete the interior
-	 * nodes, some depth fields will be off:
- */
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
- BTREE_ITER_intent, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
- if (ret)
- goto err;
-
- darray_for_each(d->delete_interior, i) {
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(trans, i->id));
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting snapshot %u", i->id);
- if (ret)
- goto err;
- }
-err:
- mutex_lock(&d->progress_lock);
- darray_exit(&d->deleting_from_trees);
- darray_exit(&d->delete_interior);
- darray_exit(&d->delete_leaves);
- d->running = false;
- mutex_unlock(&d->progress_lock);
- bch2_trans_put(trans);
-
- bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
-out_unlock:
- mutex_unlock(&d->lock);
- if (!bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
- return ret;
-}
-
-int bch2_delete_dead_snapshots(struct bch_fs *c)
-{
- if (!c->opts.auto_snapshot_deletion)
- return 0;
-
- return __bch2_delete_dead_snapshots(c);
-}
-
-void bch2_delete_dead_snapshots_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);
-
- set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
-
- bch2_delete_dead_snapshots(c);
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-void bch2_delete_dead_snapshots_async(struct bch_fs *c)
-{
- if (!c->opts.auto_snapshot_deletion)
- return;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
- return;
-
- BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
-
- if (!queue_work(system_long_wq, &c->snapshot_delete.work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct snapshot_delete *d = &c->snapshot_delete;
-
- if (!d->running) {
- prt_str(out, "(not running)");
- return;
- }
-
- mutex_lock(&d->progress_lock);
- bch2_snapshot_delete_nodes_to_text(out, d);
-
- bch2_bbpos_to_text(out, d->pos);
- mutex_unlock(&d->progress_lock);
-}
-
-int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id id,
- struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots,
- k, ret) {
- if (!bkey_eq(pos, k.k->p))
- break;
-
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
- ret = 1;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
-{
- /* If there's one child, it's redundant and keys will be moved to the child */
- return !!snap.v->children[0] + !!snap.v->children[1] == 1;
-}
-
-static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
-{
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
- interior_snapshot_needs_delete(snap))
- set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
-
- return 0;
-}
-
-int bch2_snapshots_read(struct bch_fs *c)
-{
- /*
- * Initializing the is_ancestor bitmaps requires ancestors to already be
- * initialized - so mark in reverse:
- */
- int ret = bch2_trans_run(c,
- for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
- POS_MAX, 0, k,
- __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
- bch2_check_snapshot_needs_deletion(trans, k)));
- bch_err_fn(c, ret);
-
- /*
-	 * It's important that we check whether we need to reconstruct snapshots
-	 * before going RW, so we mark that pass as required in the superblock -
-	 * otherwise, we could end up deleting keys whose snapshot nodes are
-	 * missing instead of reconstructing them
- */
- BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
- test_bit(BCH_FS_may_go_rw, &c->flags));
-
- return ret;
-}
-
-void bch2_fs_snapshots_exit(struct bch_fs *c)
-{
- kvfree(rcu_dereference_protected(c->snapshots, true));
-}
-
-void bch2_fs_snapshots_init_early(struct bch_fs *c)
-{
- INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
- mutex_init(&c->snapshot_delete.lock);
- mutex_init(&c->snapshot_delete.progress_lock);
- mutex_init(&c->snapshots_unlinked_lock);
-}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
deleted file mode 100644
index 6766bf673ed9..000000000000
--- a/fs/bcachefs/snapshot.h
+++ /dev/null
@@ -1,275 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_H
-#define _BCACHEFS_SNAPSHOT_H
-
-void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-
-#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
- .key_validate = bch2_snapshot_tree_validate, \
- .val_to_text = bch2_snapshot_tree_to_text, \
- .min_val_size = 8, \
-})
-
-struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
-
-int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
-
-void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
- .key_validate = bch2_snapshot_validate, \
- .val_to_text = bch2_snapshot_to_text, \
- .trigger = bch2_mark_snapshot, \
- .min_val_size = 24, \
-})
-
-static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
-{
- u32 idx = U32_MAX - id;
-
- return likely(t && idx < t->nr)
- ? &t->s[idx]
- : NULL;
-}
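A small standalone sketch of the index mapping above, assuming snapshot IDs are handed out downward from U32_MAX (the root snapshot created at filesystem init sits at U32_MAX), so U32_MAX - id yields a small, dense table index.

/* Standalone sketch of the table indexing above: with IDs allocated downward
 * from U32_MAX, U32_MAX - id maps recent snapshot IDs to low array indexes. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t ids[] = { UINT32_MAX, UINT32_MAX - 1, UINT32_MAX - 5 };

	for (unsigned i = 0; i < 3; i++) {
		uint32_t idx = UINT32_MAX - ids[i];

		printf("snapshot id %u -> table index %u\n", ids[i], idx);
	}
	return 0;
}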
-
-static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
-{
- return __snapshot_t(rcu_dereference(c->snapshots), id);
-}
-
-static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->tree : 0;
-}
-
-static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->parent : 0;
-}
-
-static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- return __bch2_snapshot_parent_early(c, id);
-}
-
-static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
- if (!s)
- return 0;
-
- u32 parent = s->parent;
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- parent &&
- s->depth != snapshot_t(c, parent)->depth + 1)
- panic("id %u depth=%u parent %u depth=%u\n",
- id, snapshot_t(c, id)->depth,
- parent, snapshot_t(c, parent)->depth);
-
- return parent;
-}
-
-static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- return __bch2_snapshot_parent(c, id);
-}
-
-static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
-{
- guard(rcu)();
- while (n--)
- id = __bch2_snapshot_parent(c, id);
- return id;
-}
-
-u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *);
-u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
-
-static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
-
- u32 parent;
- while ((parent = __bch2_snapshot_parent(c, id)))
- id = parent;
- return id;
-}
-
-static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->state : SNAPSHOT_ID_empty;
-}
-
-static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- return __bch2_snapshot_id_state(c, id);
-}
-
-static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
-{
- return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live;
-}
-
-static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
-}
-
-static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
- int ret = bch2_snapshot_is_internal_node(c, id);
- if (ret < 0)
- return ret;
- return !ret;
-}
-
-static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
-{
- guard(rcu)();
- return parent ? snapshot_t(c, parent)->depth + 1 : 0;
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
-
-static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
- return id == ancestor
- ? true
- : __bch2_snapshot_is_ancestor(c, id, ancestor);
-}
-
-static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
-{
- guard(rcu)();
- const struct snapshot_t *t = snapshot_t(c, id);
- return t && (t->children[0]|t->children[1]) != 0;
-}
-
-static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
-{
- return darray_find(*s, id) != NULL;
-}
-
-static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- darray_for_each(*s, i)
- if (bch2_snapshot_is_ancestor(c, id, *i))
- return true;
- return false;
-}
-
-static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- BUG_ON(snapshot_list_has_id(s, id));
- int ret = darray_push(s, id);
- if (ret)
- bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
- return ret;
-}
-
-static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- int ret = snapshot_list_has_id(s, id)
- ? 0
- : darray_push(s, id);
- if (ret)
- bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
- return ret;
-}
-
-static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
-{
- darray_for_each(*src, i) {
- int ret = snapshot_list_add_nodup(c, dst, *i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot *s);
-int bch2_snapshot_get_subvol(struct btree_trans *, u32,
- struct bch_subvolume *);
-
-/* only exported for tests: */
-int bch2_snapshot_node_create(struct btree_trans *, u32,
- u32 *, u32 *, unsigned);
-
-int bch2_check_snapshot_trees(struct bch_fs *);
-int bch2_check_snapshots(struct bch_fs *);
-int bch2_reconstruct_snapshots(struct bch_fs *);
-
-int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
-
-static inline int bch2_check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot))
- ? 0
- : __bch2_check_key_has_snapshot(trans, iter, k);
-}
-
-int __bch2_get_snapshot_overwrites(struct btree_trans *,
- enum btree_id, struct bpos,
- snapshot_id_list *);
-
-/*
- * Get a list of snapshot IDs that have overwritten a given key:
- */
-static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos,
- snapshot_id_list *s)
-{
- darray_init(s);
-
- return bch2_snapshot_has_children(trans->c, pos.snapshot)
- ? __bch2_get_snapshot_overwrites(trans, btree, pos, s)
- : 0;
-
-}
-
-int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
-
-int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
-
-static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
- enum btree_id id,
- struct bpos pos)
-{
- if (!btree_type_has_snapshots(id) ||
- bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
- return 0;
-
- return __bch2_key_has_snapshot_overwrites(trans, id, pos);
-}
-
-int __bch2_delete_dead_snapshots(struct bch_fs *);
-int bch2_delete_dead_snapshots(struct bch_fs *);
-void bch2_delete_dead_snapshots_work(struct work_struct *);
-void bch2_delete_dead_snapshots_async(struct bch_fs *);
-void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *);
-
-int bch2_snapshots_read(struct bch_fs *);
-void bch2_fs_snapshots_exit(struct bch_fs *);
-void bch2_fs_snapshots_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_SNAPSHOT_H */
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
deleted file mode 100644
index 9bccae1f3590..000000000000
--- a/fs/bcachefs/snapshot_format.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
-#define _BCACHEFS_SNAPSHOT_FORMAT_H
-
-struct bch_snapshot {
- struct bch_val v;
- __le32 flags;
- __le32 parent;
- __le32 children[2];
- __le32 subvol;
- /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
- __le32 tree;
- __le32 depth;
- __le32 skip[3];
- bch_le128 btime;
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1)
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3)
-
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us a persistent identifier for each tree of
- * bch_snapshot nodes, and allows us to record and easily find the root/master
- * subvolume that other snapshots were created from:
- */
-struct bch_snapshot_tree {
- struct bch_val v;
- __le32 master_subvol;
- __le32 root_snapshot;
-};
-
-#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h
deleted file mode 100644
index 0ab698f13e5c..000000000000
--- a/fs/bcachefs/snapshot_types.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SNAPSHOT_TYPES_H
-#define _BCACHEFS_SNAPSHOT_TYPES_H
-
-#include "bbpos_types.h"
-#include "darray.h"
-#include "subvolume_types.h"
-
-typedef DARRAY(u32) snapshot_id_list;
-
-#define IS_ANCESTOR_BITMAP 128
-
-struct snapshot_t {
- enum snapshot_id_state {
- SNAPSHOT_ID_empty,
- SNAPSHOT_ID_live,
- SNAPSHOT_ID_deleted,
- } state;
- u32 parent;
- u32 skip[3];
- u32 depth;
- u32 children[2];
- u32 subvol; /* Nonzero only if a subvolume points to this node: */
- u32 tree;
- unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
-};
-
-struct snapshot_table {
- struct rcu_head rcu;
- size_t nr;
-#ifndef RUST_BINDGEN
- DECLARE_FLEX_ARRAY(struct snapshot_t, s);
-#else
- struct snapshot_t s[0];
-#endif
-};
-
-struct snapshot_interior_delete {
- u32 id;
- u32 live_child;
-};
-typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
-
-struct snapshot_delete {
- struct mutex lock;
- struct work_struct work;
-
- struct mutex progress_lock;
- snapshot_id_list deleting_from_trees;
- snapshot_id_list delete_leaves;
- interior_delete_list delete_interior;
-
- bool running;
- struct bbpos pos;
-};
-
-#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
deleted file mode 100644
index 3e9f59226bdf..000000000000
--- a/fs/bcachefs/str_hash.c
+++ /dev/null
@@ -1,400 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "dirent.h"
-#include "fsck.h"
-#include "str_hash.h"
-#include "subvolume.h"
-
-static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
-{
- if (d.v->d_type == DT_SUBVOL) {
- struct bch_subvolume subvol;
- int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol),
- false, &subvol);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
- return !ret;
- } else {
- struct btree_iter iter;
- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bkey_is_inode(k.k);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
- }
-}
-
-static int bch2_fsck_rename_dirent(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c_dirent old,
- bool *updated_before_k_pos)
-{
- struct bch_fs *c = trans->c;
- struct qstr old_name = bch2_dirent_get_name(old);
- struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
- int ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bkey_dirent_init(&new->k_i);
- dirent_copy_target(new, old);
- new->k.p = old.k->p;
-
- char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20);
- ret = PTR_ERR_OR_ZERO(renamed_buf);
- if (ret)
- return ret;
-
- for (unsigned i = 0; i < 1000; i++) {
- new->k.u64s = BKEY_U64s_MAX;
-
- struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf,
- sprintf(renamed_buf, "%.*s.fsck_renamed-%u",
- old_name.len, old_name.name, i));
-
- ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL);
- if (ret)
- return ret;
-
- ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- (subvol_inum) { 0, old.k->p.inode },
- old.k->p.snapshot, &new->k_i,
- BTREE_UPDATE_internal_snapshot_node|
- STR_HASH_must_create);
- if (ret && !bch2_err_matches(ret, EEXIST))
- break;
- if (!ret) {
- if (bpos_lt(new->k.p, old.k->p))
- *updated_before_k_pos = true;
- break;
- }
- }
-
- ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static noinline int hash_pick_winner(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c k1,
- struct bkey_s_c k2)
-{
- if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
- !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
- return 0;
-
- switch (desc.btree_id) {
- case BTREE_ID_dirents: {
- int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1));
- if (ret < 0)
- return ret;
- if (!ret)
- return 0;
-
- ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2));
- if (ret < 0)
- return ret;
- if (!ret)
- return 1;
- return 2;
- }
- default:
- return 0;
- }
-}
-
-/*
- * str_hash lookups across snapshots break in wild ways if hash_info in
- * different snapshot versions doesn't match - so if we find one mismatch, check
- * them all
- */
-int bch2_repair_inode_hash_info(struct btree_trans *trans,
- struct bch_inode_unpacked *snapshot_root)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct printbuf buf = PRINTBUF;
- bool need_commit = false;
- int ret = 0;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes,
- POS(0, snapshot_root->bi_inum),
- BTREE_ITER_all_snapshots, k, ret) {
- if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot)))
- break;
- if (!bkey_is_inode(k.k))
- continue;
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- break;
-
- if (inode.bi_hash_seed == snapshot_root->bi_hash_seed &&
- INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) {
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root);
- struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
-
- BUG_ON(hash1.type != hash2.type ||
- memcmp(&hash1.siphash_key,
- &hash2.siphash_key,
- sizeof(hash1.siphash_key)));
-#endif
- continue;
- }
-
- printbuf_reset(&buf);
- prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n",
- snapshot_root->bi_inum,
- inode.bi_snapshot,
- snapshot_root->bi_snapshot);
-
- bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode));
- prt_printf(&buf, " %llx\n", inode.bi_hash_seed);
-
- bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
- prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed);
-
- if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) {
- inode.bi_hash_seed = snapshot_root->bi_hash_seed;
- SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root));
-
- ret = __bch2_fsck_write_inode(trans, &inode);
- if (ret)
- break;
- need_commit = true;
- }
- }
-
- if (ret)
- goto err;
-
- if (!need_commit) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n",
- snapshot_root->bi_inum);
-
- prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot);
- bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
- prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed);
-#if 0
- prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot);
- bch2_prt_str_hash_type(&buf, hash_info->type);
- prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1);
-#endif
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- ret = bch_err_throw(c, fsck_repair_unimplemented);
- goto err;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * All versions of the same inode in different snapshots must have the same hash
- * seed/type: verify that the hash info we're using matches the root
- */
-static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
- struct bch_hash_info *hash_info)
-{
- struct bch_inode_unpacked snapshot_root;
- int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root);
- if (ret)
- return ret;
-
- struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root);
- if (hash_info->type != hash_root.type ||
- memcmp(&hash_info->siphash_key,
- &hash_root.siphash_key,
- sizeof(hash_root.siphash_key)))
- ret = bch2_repair_inode_hash_info(trans, &snapshot_root);
-
- return ret;
-}
-
-/* Put a str_hash key in its proper location, checking for duplicates */
-int bch2_str_hash_repair_key(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc *desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c k,
- struct btree_iter *dup_iter, struct bkey_s_c dup_k,
- bool *updated_before_k_pos)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- bool free_snapshots_seen = false;
- int ret = 0;
-
- if (!s) {
- s = bch2_trans_kmalloc(trans, sizeof(*s));
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- goto out;
-
- s->pos = k_iter->pos;
- darray_init(&s->ids);
-
- ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids);
- if (ret)
- goto out;
-
- free_snapshots_seen = true;
- }
-
- if (!dup_k.k) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info,
- (subvol_inum) { 0, new->k.p.inode },
- new->k.p.snapshot, new,
- STR_HASH_must_create|
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node);
- ret = bkey_err(dup_k);
- if (ret)
- goto out;
- if (dup_k.k)
- goto duplicate_entries;
-
- if (bpos_lt(new->k.p, k.k->p))
- *updated_before_k_pos = true;
-
- ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id,
- k_iter->pos, new->k.p) ?:
- bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_commit;
- } else {
-duplicate_entries:
- ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k);
- if (ret < 0)
- goto out;
-
- if (!fsck_err(trans, hash_table_key_duplicate,
- "duplicate hash table keys%s:\n%s",
- ret != 2 ? "" : ", both point to valid inodes",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- prt_newline(&buf),
- bch2_bkey_val_to_text(&buf, c, dup_k),
- buf.buf)))
- goto out;
-
- switch (ret) {
- case 0:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
- break;
- case 1:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0);
- break;
- case 2:
- ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info,
- bkey_s_c_to_dirent(k),
- updated_before_k_pos) ?:
- bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
- BTREE_ITER_with_updates);
- goto out;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_commit;
- }
-out:
-fsck_err:
- bch2_trans_iter_exit(trans, dup_iter);
- printbuf_exit(&buf);
- if (free_snapshots_seen)
- darray_exit(&s->ids);
- return ret;
-}
-
-int __bch2_str_hash_check_key(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc *desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k,
- bool *updated_before_k_pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter = {};
- struct printbuf buf = PRINTBUF;
- struct bkey_s_c k;
- int ret = 0;
-
- u64 hash = desc->hash_bkey(hash_info, hash_k);
- if (hash_k.k->p.offset < hash)
- goto bad_hash;
-
- for_each_btree_key_norestart(trans, iter, desc->btree_id,
- SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
- BTREE_ITER_slots|
- BTREE_ITER_with_updates, k, ret) {
- if (bkey_eq(k.k->p, hash_k.k->p))
- break;
-
- if (k.k->type == desc->key_type &&
- !desc->cmp_bkey(k, hash_k)) {
- ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode,
- hash_info) ?:
- bch2_str_hash_repair_key(trans, s, desc, hash_info,
- k_iter, hash_k,
- &iter, k, updated_before_k_pos);
- break;
- }
-
- if (bkey_deleted(k.k))
- goto bad_hash;
- }
- bch2_trans_iter_exit(trans, &iter);
-out:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-bad_hash:
- bch2_trans_iter_exit(trans, &iter);
- /*
- * Before doing any repair, check hash_info itself:
- */
- ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info);
- if (ret)
- goto out;
-
- if (fsck_err(trans, hash_table_key_wrong_offset,
- "hash table key at wrong offset: should be at %llu\n%s",
- hash,
- (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)))
- ret = bch2_str_hash_repair_key(trans, s, desc, hash_info,
- k_iter, hash_k,
- &iter, bkey_s_c_null,
- updated_before_k_pos);
- goto out;
-}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
deleted file mode 100644
index 8979ac2d7a3b..000000000000
--- a/fs/bcachefs/str_hash.h
+++ /dev/null
@@ -1,431 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_STR_HASH_H
-#define _BCACHEFS_STR_HASH_H
-
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "checksum.h"
-#include "error.h"
-#include "inode.h"
-#include "siphash.h"
-#include "subvolume.h"
-#include "super.h"
-
-#include <linux/crc32c.h>
-#include <crypto/sha2.h>
-
-static inline enum bch_str_hash_type
-bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
-{
- switch (opt) {
- case BCH_STR_HASH_OPT_crc32c:
- return BCH_STR_HASH_crc32c;
- case BCH_STR_HASH_OPT_crc64:
- return BCH_STR_HASH_crc64;
- case BCH_STR_HASH_OPT_siphash:
- return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
- ? BCH_STR_HASH_siphash
- : BCH_STR_HASH_siphash_old;
- default:
- BUG();
- }
-}
-
-struct bch_hash_info {
- u32 inum_snapshot;
- u8 type;
- struct unicode_map *cf_encoding;
- /*
-	 * For crc32 or crc64 string hashes, the first key value of
- * the siphash_key (k0) is used as the key.
- */
- SIPHASH_KEY siphash_key;
-};
-
-static inline struct bch_hash_info
-bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
-{
- struct bch_hash_info info = {
- .inum_snapshot = bi->bi_snapshot,
- .type = INODE_STR_HASH(bi),
- .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
- .siphash_key = { .k0 = bi->bi_hash_seed }
- };
-
- if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
- u8 digest[SHA256_DIGEST_SIZE];
-
- sha256((const u8 *)&bi->bi_hash_seed,
- sizeof(bi->bi_hash_seed), digest);
- memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
- }
-
- return info;
-}
-
-struct bch_str_hash_ctx {
- union {
- u32 crc32c;
- u64 crc64;
- SIPHASH_CTX siphash;
- };
-};
-
-static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info)
-{
- switch (info->type) {
- case BCH_STR_HASH_crc32c:
- ctx->crc32c = crc32c(~0, &info->siphash_key.k0,
- sizeof(info->siphash_key.k0));
- break;
- case BCH_STR_HASH_crc64:
- ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,
- sizeof(info->siphash_key.k0));
- break;
- case BCH_STR_HASH_siphash_old:
- case BCH_STR_HASH_siphash:
- SipHash24_Init(&ctx->siphash, &info->siphash_key);
- break;
- default:
- BUG();
- }
-}
-
-static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info,
- const void *data, size_t len)
-{
- switch (info->type) {
- case BCH_STR_HASH_crc32c:
- ctx->crc32c = crc32c(ctx->crc32c, data, len);
- break;
- case BCH_STR_HASH_crc64:
- ctx->crc64 = crc64_be(ctx->crc64, data, len);
- break;
- case BCH_STR_HASH_siphash_old:
- case BCH_STR_HASH_siphash:
- SipHash24_Update(&ctx->siphash, data, len);
- break;
- default:
- BUG();
- }
-}
-
-static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info)
-{
- switch (info->type) {
- case BCH_STR_HASH_crc32c:
- return ctx->crc32c;
- case BCH_STR_HASH_crc64:
- return ctx->crc64 >> 1;
- case BCH_STR_HASH_siphash_old:
- case BCH_STR_HASH_siphash:
- return SipHash24_End(&ctx->siphash) >> 1;
- default:
- BUG();
- }
-}
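The init/update/end triple above dispatches on the per-inode hash type; the following standalone sketch shows the same dispatch shape using a toy FNV-1a hash as a hypothetical stand-in, since the kernel's crc32c/crc64/siphash helpers aren't available in a standalone program.

/* Standalone sketch of the init/update/end dispatch shape, using a toy
 * FNV-1a hash as a hypothetical stand-in for crc32c/crc64/siphash. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

enum toy_hash_type { TOY_HASH_fnv1a };

struct toy_hash_ctx { uint64_t h; };

static void toy_hash_init(struct toy_hash_ctx *ctx, enum toy_hash_type type)
{
	switch (type) {
	case TOY_HASH_fnv1a:
		ctx->h = 0xcbf29ce484222325ULL;		/* FNV offset basis */
		break;
	}
}

static void toy_hash_update(struct toy_hash_ctx *ctx, enum toy_hash_type type,
			    const void *data, size_t len)
{
	const uint8_t *p = data;

	switch (type) {
	case TOY_HASH_fnv1a:
		while (len--) {
			ctx->h ^= *p++;
			ctx->h *= 0x100000001b3ULL;	/* FNV prime */
		}
		break;
	}
}

static uint64_t toy_hash_end(struct toy_hash_ctx *ctx, enum toy_hash_type type)
{
	switch (type) {
	case TOY_HASH_fnv1a:
		return ctx->h >> 1;	/* mirror the >> 1 on the 64-bit cases */
	}
	return 0;
}

int main(void)
{
	const char *name = "hello.txt";
	struct toy_hash_ctx ctx;

	toy_hash_init(&ctx, TOY_HASH_fnv1a);
	toy_hash_update(&ctx, TOY_HASH_fnv1a, name, strlen(name));
	printf("hash(\"%s\") = %llu\n", name,
	       (unsigned long long)toy_hash_end(&ctx, TOY_HASH_fnv1a));
	return 0;
}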
-
-struct bch_hash_desc {
- enum btree_id btree_id;
- u8 key_type;
-
- u64 (*hash_key)(const struct bch_hash_info *, const void *);
- u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
- bool (*cmp_key)(struct bkey_s_c, const void *);
- bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
- bool (*is_visible)(subvol_inum inum, struct bkey_s_c);
-};
-
-static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
-{
- return k.k->type == desc.key_type &&
- (!desc.is_visible ||
- !inum.inum ||
- desc.is_visible(inum, k));
-}
-
-static __always_inline struct bkey_s_c
-bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key,
- enum btree_iter_update_trigger_flags flags,
- u32 snapshot)
-{
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
- SPOS(inum.inum, desc.hash_key(info, key), snapshot),
- POS(inum.inum, U64_MAX),
- BTREE_ITER_slots|flags, k, ret) {
- if (is_visible_key(desc, inum, k)) {
- if (!desc.cmp_key(k, key))
- return k;
- } else if (k.k->type == KEY_TYPE_hash_whiteout) {
- ;
- } else {
- /* hole, not found */
- break;
- }
- }
- bch2_trans_iter_exit(trans, iter);
-
- return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup);
-}
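The lookup above is effectively linear probing over an open-addressed table keyed by (inode, hash): scan forward from the hashed slot, keep going over whiteouts and colliding keys, and stop at the first true hole. A standalone sketch of that probe loop over a plain array (toy hash and slot layout, not the btree API):

/* Standalone sketch of the linear-probe lookup above over a plain array:
 * scan from the hashed slot, skip whiteouts, stop at the first hole. */
#include <stdio.h>
#include <string.h>

#define NR_SLOTS 8

enum slot_type { SLOT_hole, SLOT_whiteout, SLOT_live };

struct slot { enum slot_type type; const char *name; };

static unsigned toy_hash(const char *name)
{
	unsigned h = 5381;

	while (*name)
		h = h * 33 + (unsigned char)*name++;
	return h % NR_SLOTS;
}

static int lookup(const struct slot *table, const char *name)
{
	for (unsigned i = toy_hash(name); i < NR_SLOTS; i++) {
		if (table[i].type == SLOT_live && !strcmp(table[i].name, name))
			return i;		/* found */
		if (table[i].type == SLOT_hole)
			break;			/* hole: not present */
		/* whiteout or colliding key: keep scanning */
	}
	return -1;
}

int main(void)
{
	struct slot table[NR_SLOTS] = { { SLOT_hole } };
	const char *name = "foo";
	unsigned h = toy_hash(name);

	/* place "foo" one past its ideal slot, behind a whiteout, to show
	 * that probing keeps going over whiteouts */
	table[h].type = SLOT_whiteout;
	table[h + 1].type = SLOT_live;
	table[h + 1].name = name;

	printf("\"%s\" hashes to %u, found at slot %d\n",
	       name, h, lookup(table, name));
	return 0;
}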
-
-static __always_inline struct bkey_s_c
-bch2_hash_lookup(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key,
- enum btree_iter_update_trigger_flags flags)
-{
- u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return bkey_s_c_err(ret);
-
- return bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
-}
-
-static __always_inline int
-bch2_hash_hole(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key)
-{
- struct bkey_s_c k;
- u32 snapshot;
- int ret;
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
- SPOS(inum.inum, desc.hash_key(info, key), snapshot),
- POS(inum.inum, U64_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent, k, ret)
- if (!is_visible_key(desc, inum, k))
- return 0;
- bch2_trans_iter_exit(trans, iter);
-
- return ret ?: -BCH_ERR_ENOSPC_str_hash_create;
-}
-
-static __always_inline
-int bch2_hash_needs_whiteout(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *start)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_copy_iter(trans, &iter, start);
-
- bch2_btree_iter_advance(trans, &iter);
-
- for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) {
- if (k.k->type != desc.key_type &&
- k.k->type != KEY_TYPE_hash_whiteout)
- break;
-
- if (k.k->type == desc.key_type &&
- desc.hash_bkey(info, k) <= start->pos.offset) {
- ret = 1;
- break;
- }
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static __always_inline
-struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, u32 snapshot,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter slot = {};
- struct bkey_s_c k;
- bool found = false;
- int ret;
-
- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id,
- SPOS(insert->k.p.inode,
- desc.hash_bkey(info, bkey_i_to_s_c(insert)),
- snapshot),
- POS(insert->k.p.inode, U64_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
- if (is_visible_key(desc, inum, k)) {
- if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
- goto found;
-
- /* hash collision: */
- continue;
- }
-
- if (!slot.path && !(flags & STR_HASH_must_replace))
- bch2_trans_copy_iter(trans, &slot, iter);
-
- if (k.k->type != KEY_TYPE_hash_whiteout)
- goto not_found;
- }
-
- if (!ret)
- ret = bch_err_throw(c, ENOSPC_str_hash_create);
-out:
- bch2_trans_iter_exit(trans, &slot);
- bch2_trans_iter_exit(trans, iter);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
-found:
- found = true;
-not_found:
- if (found && (flags & STR_HASH_must_create)) {
- bch2_trans_iter_exit(trans, &slot);
- return k;
- } else if (!found && (flags & STR_HASH_must_replace)) {
- ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace);
- } else {
- if (!found && slot.path)
- swap(*iter, slot);
-
- insert->k.p = iter->pos;
- ret = bch2_trans_update(trans, iter, insert, flags);
- }
-
- goto out;
-}
-
-static __always_inline
-int bch2_hash_set_in_snapshot(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, u32 snapshot,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum,
- snapshot, insert, flags);
- int ret = bkey_err(k);
- if (ret)
- return ret;
- if (k.k) {
- bch2_trans_iter_exit(trans, &iter);
- return bch_err_throw(trans->c, EEXIST_str_hash_set);
- }
-
- return 0;
-}
-
-static __always_inline
-int bch2_hash_set(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum,
- struct bkey_i *insert,
- enum btree_iter_update_trigger_flags flags)
-{
- insert->k.p.inode = inum.inum;
-
- u32 snapshot;
- return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
- bch2_hash_set_in_snapshot(trans, desc, info, inum,
- snapshot, insert, flags);
-}
-
-static __always_inline
-int bch2_hash_delete_at(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter,
- enum btree_iter_update_trigger_flags flags)
-{
- struct bkey_i *delete;
- int ret;
-
- delete = bch2_trans_kmalloc(trans, sizeof(*delete));
- ret = PTR_ERR_OR_ZERO(delete);
- if (ret)
- return ret;
-
- ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
- if (ret < 0)
- return ret;
-
- bkey_init(&delete->k);
- delete->k.p = iter->pos;
- delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
-
- return bch2_trans_update(trans, iter, delete, flags);
-}
-
-static __always_inline
-int bch2_hash_delete(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum, const void *key)
-{
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
- BTREE_ITER_intent);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *);
-
-struct snapshots_seen;
-int bch2_str_hash_repair_key(struct btree_trans *,
- struct snapshots_seen *,
- const struct bch_hash_desc *,
- struct bch_hash_info *,
- struct btree_iter *, struct bkey_s_c,
- struct btree_iter *, struct bkey_s_c,
- bool *);
-
-int __bch2_str_hash_check_key(struct btree_trans *,
- struct snapshots_seen *,
- const struct bch_hash_desc *,
- struct bch_hash_info *,
- struct btree_iter *, struct bkey_s_c,
- bool *);
-
-static inline int bch2_str_hash_check_key(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc *desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k,
- bool *updated_before_k_pos)
-{
- if (hash_k.k->type != desc->key_type)
- return 0;
-
- if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset))
- return 0;
-
- return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k,
- updated_before_k_pos);
-}
-
-#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
deleted file mode 100644
index 020587449123..000000000000
--- a/fs/bcachefs/subvolume.c
+++ /dev/null
@@ -1,752 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "recovery_passes.h"
-#include "snapshot.h"
-#include "subvolume.h"
-
-#include <linux/random.h>
-
-static int bch2_subvolume_delete(struct btree_trans *, u32);
-
-static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid)
-{
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "missing subvolume %u", subvolid);
- bool print = bch2_count_fsck_err(c, subvol_missing, &buf);
-
- int ret = bch2_run_explicit_recovery_pass(c, &buf,
- BCH_RECOVERY_PASS_check_inodes, 0);
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- return ret;
-}
-
-static struct bpos subvolume_children_pos(struct bkey_s_c k)
-{
- if (k.k->type != KEY_TYPE_subvolume)
- return POS_MIN;
-
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- if (!s.v->fs_path_parent)
- return POS_MIN;
- return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset);
-}
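A tiny standalone sketch of the key encoding above, assuming keys sorted by (fs_path_parent, child subvolume): listing a subvolume's children is a scan over keys whose first field equals the parent.

/* Standalone sketch of the (parent, child) key encoding above. */
#include <stdio.h>
#include <stdint.h>

struct child_key { uint32_t fs_path_parent; uint32_t subvol; };

int main(void)
{
	/* pretend btree contents, sorted by (parent, child) */
	struct child_key keys[] = {
		{ 1, 2 }, { 1, 5 }, { 3, 4 },
	};
	uint32_t parent = 1;

	for (unsigned i = 0; i < 3; i++)
		if (keys[i].fs_path_parent == parent)
			printf("subvol %u is a child of %u\n",
			       keys[i].subvol, parent);
	return 0;
}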
-
-static int check_subvol(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_subvolume subvol;
- struct btree_iter subvol_children_iter = {};
- struct bch_snapshot snapshot;
- struct printbuf buf = PRINTBUF;
- unsigned snapid;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- subvol = bkey_s_c_to_subvolume(k);
- snapid = le32_to_cpu(subvol.v->snapshot);
- ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
-
- if (bch2_err_matches(ret, ENOENT))
- return bch2_run_print_explicit_recovery_pass(c,
- BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
- if (ret)
- return ret;
-
- if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
- return ret ?: -BCH_ERR_transaction_restart_nested;
- }
-
- if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
- subvol.v->fs_path_parent,
- trans, subvol_root_fs_path_parent_nonzero,
- "root subvolume has nonzero fs_path_parent\n%s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- struct bkey_i_subvolume *n =
- bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- n->v.fs_path_parent = 0;
- }
-
- if (subvol.v->fs_path_parent) {
- struct bpos pos = subvolume_children_pos(k);
-
- struct bkey_s_c subvol_children_k =
- bch2_bkey_get_iter(trans, &subvol_children_iter,
- BTREE_ID_subvolume_children, pos, 0);
- ret = bkey_err(subvol_children_k);
- if (ret)
- goto err;
-
- if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
- trans, subvol_children_not_set,
- "subvolume not set in subvolume_children btree at %llu:%llu\n%s",
- pos.inode, pos.offset,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
- if (ret)
- goto err;
- }
- }
-
- struct bch_inode_unpacked inode;
- ret = bch2_inode_find_by_inum_nowarn_trans(trans,
- (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
- &inode);
- if (!ret) {
- if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
- trans, subvol_root_wrong_bi_subvol,
- "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
- inode.bi_inum, inode.bi_snapshot,
- inode.bi_subvol, subvol.k->p.offset)) {
- inode.bi_subvol = subvol.k->p.offset;
- inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
- ret = __bch2_fsck_write_inode(trans, &inode);
- if (ret)
- goto err;
- }
- } else if (bch2_err_matches(ret, ENOENT)) {
- if (fsck_err(trans, subvol_to_missing_root,
- "subvolume %llu points to missing subvolume root %llu:%u",
- k.k->p.offset, le64_to_cpu(subvol.v->inode),
- le32_to_cpu(subvol.v->snapshot))) {
- /*
- * Recreate - any contents that are still disconnected
- * will then get reattached under lost+found
- */
- bch2_inode_init_early(c, &inode);
- bch2_inode_init_late(c, &inode, bch2_current_time(c),
- 0, 0, S_IFDIR|0700, 0, NULL);
- inode.bi_inum = le64_to_cpu(subvol.v->inode);
- inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
- inode.bi_subvol = k.k->p.offset;
- inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent);
- ret = __bch2_fsck_write_inode(trans, &inode);
- if (ret)
- goto err;
- }
- } else {
- goto err;
- }
-
- if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
- u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
- u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root);
-
- struct bch_snapshot_tree st;
- ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
-
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "%s: snapshot tree %u not found", __func__, snapshot_tree);
-
- if (ret)
- goto err;
-
- if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
- trans, subvol_not_master_and_not_snapshot,
- "subvolume %llu is not set as snapshot but is not master subvolume",
- k.k->p.offset)) {
- struct bkey_i_subvolume *s =
- bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- goto err;
-
- SET_BCH_SUBVOLUME_SNAP(&s->v, true);
- }
- }
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &subvol_children_iter);
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_subvols(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_subvol(trans, &iter, k)));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int check_subvol_child(struct btree_trans *trans,
- struct btree_iter *child_iter,
- struct bkey_s_c child_k)
-{
- struct bch_subvolume s;
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
- 0, subvolume, &s);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (fsck_err_on(ret ||
- le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
- trans, subvol_children_bad,
- "incorrect entry in subvolume_children btree %llu:%llu",
- child_k.k->p.inode, child_k.k->p.offset)) {
- ret = bch2_btree_delete_at(trans, child_iter, 0);
- if (ret)
- goto err;
- }
-err:
-fsck_err:
- return ret;
-}
-
-int bch2_check_subvol_children(struct bch_fs *c)
-{
- int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_subvol_child(trans, &iter, k)));
- bch_err_fn(c, ret);
- return 0;
-}
-
-/* Subvolumes: */
-
-int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k);
- int ret = 0;
-
- bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
- bkey_gt(k.k->p, SUBVOL_POS_MAX),
- c, subvol_pos_bad,
- "invalid pos");
-
- bkey_fsck_err_on(!subvol.v->snapshot,
- c, subvol_snapshot_bad,
- "invalid snapshot");
-
- bkey_fsck_err_on(!subvol.v->inode,
- c, subvol_inode_bad,
- "invalid inode");
-fsck_err:
- return ret;
-}
-
-void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
-
- prt_printf(out, "root %llu snapshot id %u",
- le64_to_cpu(s.v->inode),
- le32_to_cpu(s.v->snapshot));
-
- if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
- prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
- prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
- }
-
- if (BCH_SUBVOLUME_RO(s.v))
- prt_printf(out, " ro");
- if (BCH_SUBVOLUME_SNAP(s.v))
- prt_printf(out, " snapshot");
- if (BCH_SUBVOLUME_UNLINKED(s.v))
- prt_printf(out, " unlinked");
-}
-
-static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
-{
- return !bpos_eq(pos, POS_MIN)
- ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set)
- : 0;
-}
-
-int bch2_subvolume_trigger(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s new,
- enum btree_iter_update_trigger_flags flags)
-{
- if (flags & BTREE_TRIGGER_transactional) {
- struct bpos children_pos_old = subvolume_children_pos(old);
- struct bpos children_pos_new = subvolume_children_pos(new.s_c);
-
- if (!bpos_eq(children_pos_old, children_pos_new)) {
- int ret = subvolume_children_mod(trans, children_pos_old, false) ?:
- subvolume_children_mod(trans, children_pos_new, true);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
-{
- struct btree_iter iter;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
- struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter);
- bch2_trans_iter_exit(trans, &iter);
-
- return bkey_err(k) ?: k.k && k.k->p.inode == subvol
- ? -BCH_ERR_ENOTEMPTY_subvol_not_empty
- : 0;
-}
-
-static __always_inline int
-bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
- bool inconsistent_if_not_found,
- struct bch_subvolume *s)
-{
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
- BTREE_ITER_cached|
- BTREE_ITER_with_updates, subvolume, s);
- if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found)
- ret = bch2_subvolume_missing(trans->c, subvol) ?: ret;
- return ret;
-}
-
-int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
- bool inconsistent_if_not_found,
- struct bch_subvolume *s)
-{
- return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s);
-}
-
-int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
-{
- struct bch_subvolume s;
- int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s);
- if (ret)
- return ret;
-
- if (BCH_SUBVOLUME_RO(&s))
- return -EROFS;
- return 0;
-}
-
-int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
-{
- return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol));
-}
-
-int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
- struct bch_subvolume *subvol)
-{
- struct bch_snapshot snap;
-
- return bch2_snapshot_lookup(trans, snapshot, &snap) ?:
- bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol);
-}
-
-int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
- u32 *snapid, bool warn)
-{
- struct btree_iter iter;
- struct bkey_s_c_subvolume subvol;
- int ret;
-
- subvol = bch2_bkey_get_iter_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_cached|BTREE_ITER_with_updates,
- subvolume);
- ret = bkey_err(subvol);
-
- if (bch2_err_matches(ret, ENOENT))
- ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
-
- if (likely(!ret))
- *snapid = le32_to_cpu(subvol.v->snapshot);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
- u32 *snapid)
-{
- return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true);
-}
-
-static int bch2_subvolume_reparent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- u32 old_parent, u32 new_parent)
-{
- struct bkey_i_subvolume *s;
- int ret;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
- le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
- return 0;
-
- s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- s->v.creation_parent = cpu_to_le32(new_parent);
- return 0;
-}
-
-/*
- * Separate from the snapshot tree in the snapshots btree, we record the tree
- * structure of how snapshot subvolumes were created - the parent subvolume of
- * each snapshot subvolume.
- *
- * When a subvolume is deleted, we scan for child subvolumes and reparent them,
- * to avoid dangling references:
- */
-static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
-{
- struct bch_subvolume s;
-
- return lockrestart_do(trans,
- bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?:
- for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_subvolume_reparent(trans, &iter, k,
- subvolid_to_delete, le32_to_cpu(s.creation_parent)));
-}
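A standalone sketch of the reparenting above, with hypothetical IDs: children of the deleted subvolume are pointed at the deleted subvolume's own creation_parent, keeping the creation tree connected.

/* Standalone sketch of the reparenting above: children of the deleted
 * subvolume are pointed at the deleted subvolume's own creation_parent. */
#include <stdio.h>
#include <stdint.h>

struct subvol { uint32_t id, creation_parent; };

int main(void)
{
	struct subvol subvols[] = {
		{ .id = 1, .creation_parent = 0 },
		{ .id = 2, .creation_parent = 1 },	/* snapshot of 1 */
		{ .id = 3, .creation_parent = 2 },	/* snapshot of 2 */
	};
	uint32_t to_delete = 2, new_parent = 1;	/* 2's creation_parent */

	for (unsigned i = 0; i < 3; i++)
		if (subvols[i].creation_parent == to_delete) {
			subvols[i].creation_parent = new_parent;
			printf("subvol %u reparented %u -> %u\n",
			       subvols[i].id, to_delete, new_parent);
		}
	return 0;
}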
-
-/*
- * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
- * deletion/cleanup:
- */
-static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
- struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {};
-
- struct bkey_s_c_subvolume subvol =
- bch2_bkey_get_iter_typed(trans, &subvol_iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_cached|BTREE_ITER_intent,
- subvolume);
- int ret = bkey_err(subvol);
- if (bch2_err_matches(ret, ENOENT))
- ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
- if (ret)
- goto err;
-
- u32 snapid = le32_to_cpu(subvol.v->snapshot);
-
- struct bkey_s_c_snapshot snapshot =
- bch2_bkey_get_iter_typed(trans, &snapshot_iter,
- BTREE_ID_snapshots, POS(0, snapid),
- 0, snapshot);
- ret = bkey_err(snapshot);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing snapshot %u", snapid);
- if (ret)
- goto err;
-
- u32 treeid = le32_to_cpu(snapshot.v->tree);
-
- struct bkey_s_c_snapshot_tree snapshot_tree =
- bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
- BTREE_ID_snapshot_trees, POS(0, treeid),
- 0, snapshot_tree);
- ret = bkey_err(snapshot_tree);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing snapshot tree %u", treeid);
- if (ret)
- goto err;
-
- if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
- struct bkey_i_snapshot_tree *snapshot_tree_mut =
- bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter,
- &snapshot_tree.s_c,
- 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(snapshot_tree_mut);
- if (ret)
- goto err;
-
- snapshot_tree_mut->v.master_subvol = 0;
- }
-
- ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?:
- bch2_snapshot_node_set_deleted(trans, snapid);
-err:
- bch2_trans_iter_exit(trans, &snapshot_tree_iter);
- bch2_trans_iter_exit(trans, &snapshot_iter);
- bch2_trans_iter_exit(trans, &subvol_iter);
- return ret;
-}
-
-static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
-{
- int ret = bch2_subvolumes_reparent(trans, subvolid) ?:
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_subvolume_delete(trans, subvolid));
-
- bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols);
- return ret;
-}
-
-static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs,
- snapshot_wait_for_pagecache_and_delete_work);
- int ret = 0;
-
- while (!ret) {
- mutex_lock(&c->snapshots_unlinked_lock);
- snapshot_id_list s = c->snapshots_unlinked;
- darray_init(&c->snapshots_unlinked);
- mutex_unlock(&c->snapshots_unlinked_lock);
-
- if (!s.nr)
- break;
-
- bch2_evict_subvolume_inodes(c, &s);
-
- darray_for_each(s, id) {
- ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
- bch_err_msg(c, ret, "deleting subvolume %u", *id);
- if (ret)
- break;
- }
-
- darray_exit(&s);
- }
-
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
-}
-
-struct subvolume_unlink_hook {
- struct btree_trans_commit_hook h;
- u32 subvol;
-};
-
-static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
- struct btree_trans_commit_hook *_h)
-{
- struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- mutex_lock(&c->snapshots_unlinked_lock);
- if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
- ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
- mutex_unlock(&c->snapshots_unlinked_lock);
-
- if (ret)
- return ret;
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache))
- return -EROFS;
-
- if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
- return 0;
-}
-
-int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
-{
- struct btree_iter iter;
- struct bkey_i_subvolume *n;
- struct subvolume_unlink_hook *h;
- int ret = 0;
-
- h = bch2_trans_kmalloc(trans, sizeof(*h));
- ret = PTR_ERR_OR_ZERO(h);
- if (ret)
- return ret;
-
- h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook;
- h->subvol = subvolid;
- bch2_trans_commit_hook(trans, &h->h);
-
- n = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, subvolid),
- BTREE_ITER_cached, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
- if (bch2_err_matches(ret, ENOENT))
- ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
- if (unlikely(ret))
- return ret;
-
- SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
- n->v.fs_path_parent = 0;
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
- u32 parent_subvolid,
- u32 src_subvolid,
- u32 *new_subvolid,
- u32 *new_snapshotid,
- bool ro)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter dst_iter, src_iter = {};
- struct bkey_i_subvolume *new_subvol = NULL;
- struct bkey_i_subvolume *src_subvol = NULL;
- u32 parent = 0, new_nodes[2], snapshot_subvols[2];
- int ret = 0;
-
- ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
- BTREE_ID_subvolumes, POS(0, U32_MAX));
- if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = bch_err_throw(c, ENOSPC_subvolume_create);
- if (ret)
- return ret;
-
- snapshot_subvols[0] = dst_iter.pos.offset;
- snapshot_subvols[1] = src_subvolid;
-
- if (src_subvolid) {
- /* Creating a snapshot: */
-
- src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
- BTREE_ID_subvolumes, POS(0, src_subvolid),
- BTREE_ITER_cached, subvolume);
- ret = PTR_ERR_OR_ZERO(src_subvol);
- if (bch2_err_matches(ret, ENOENT))
- ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret;
- if (unlikely(ret))
- goto err;
-
- parent = le32_to_cpu(src_subvol->v.snapshot);
- }
-
- ret = bch2_snapshot_node_create(trans, parent, new_nodes,
- snapshot_subvols,
- src_subvolid ? 2 : 1);
- if (ret)
- goto err;
-
- if (src_subvolid) {
- src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
- ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
- if (ret)
- goto err;
- }
-
- new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(new_subvol);
- if (ret)
- goto err;
-
- new_subvol->v.flags = 0;
- new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
- new_subvol->v.inode = cpu_to_le64(inode);
- new_subvol->v.creation_parent = cpu_to_le32(src_subvolid);
- new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid);
- new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
- new_subvol->v.otime.hi = 0;
-
- SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
- SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
-
- *new_subvolid = new_subvol->k.p.offset;
- *new_snapshotid = new_nodes[0];
-err:
- bch2_trans_iter_exit(trans, &src_iter);
- bch2_trans_iter_exit(trans, &dst_iter);
- return ret;
-}
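-
-/*
- * Illustrative sketch (hypothetical caller, not from the original file):
- * subvolume creation runs inside a transaction commit, e.g.
- *
- *	u32 subvolid, snapshotid;
- *	int ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- *			bch2_subvolume_create(trans, inode_nr,
- *					      BCACHEFS_ROOT_SUBVOL, 0,
- *					      &subvolid, &snapshotid, false));
- *
- * where inode_nr is a placeholder for the new subvolume's root inode,
- * src_subvolid == 0 means "not a snapshot of an existing subvolume", and
- * ro == false creates it read-write.
- */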
-
-int bch2_initialize_subvolumes(struct bch_fs *c)
-{
- struct bkey_i_snapshot_tree root_tree;
- struct bkey_i_snapshot root_snapshot;
- struct bkey_i_subvolume root_volume;
- int ret;
-
- bkey_snapshot_tree_init(&root_tree.k_i);
- root_tree.k.p.offset = 1;
- root_tree.v.master_subvol = cpu_to_le32(1);
- root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
-
- bkey_snapshot_init(&root_snapshot.k_i);
- root_snapshot.k.p.offset = U32_MAX;
- root_snapshot.v.flags = 0;
- root_snapshot.v.parent = 0;
- root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
- root_snapshot.v.tree = cpu_to_le32(1);
- SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
-
- bkey_subvolume_init(&root_volume.k_i);
- root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
- root_volume.v.flags = 0;
- root_volume.v.snapshot = cpu_to_le32(U32_MAX);
- root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
-
- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?:
- bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?:
- bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked inode;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!bkey_is_inode(k.k)) {
- struct bch_fs *c = trans->c;
- bch_err(c, "root inode not found");
- ret = bch_err_throw(c, ENOENT_inode);
- goto err;
- }
-
- ret = bch2_inode_unpack(k, &inode);
- BUG_ON(ret);
-
- inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
-
- ret = bch2_inode_write(trans, &iter, &inode);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/* set bi_subvol on root inode */
-int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
-{
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_fs_upgrade_for_subvolumes(trans));
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_fs_subvolumes_init_early(struct bch_fs *c)
-{
- INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
- bch2_subvolume_wait_for_pagecache_and_delete);
-}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
deleted file mode 100644
index 075f55e25c70..000000000000
--- a/fs/bcachefs/subvolume.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_H
-#define _BCACHEFS_SUBVOLUME_H
-
-#include "darray.h"
-#include "subvolume_types.h"
-
-int bch2_check_subvols(struct bch_fs *);
-int bch2_check_subvol_children(struct bch_fs *);
-
-int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s,
- enum btree_iter_update_trigger_flags);
-
-#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
- .key_validate = bch2_subvolume_validate, \
- .val_to_text = bch2_subvolume_to_text, \
- .trigger = bch2_subvolume_trigger, \
- .min_val_size = 16, \
-})
-
-int bch2_subvol_has_children(struct btree_trans *, u32);
-int bch2_subvolume_get(struct btree_trans *, unsigned,
- bool, struct bch_subvolume *);
-int __bch2_subvolume_get_snapshot(struct btree_trans *, u32,
- u32 *, bool);
-int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
-
-int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
-int bch2_subvol_is_ro(struct bch_fs *, u32);
-
-static inline struct bkey_s_c
-bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end, u32 subvolid, unsigned flags)
-{
- u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot);
- if (ret)
- return bkey_s_c_err(ret);
-
- bch2_btree_iter_set_snapshot(trans, iter, snapshot);
- return bch2_btree_iter_peek_max_type(trans, iter, end, flags);
-}
-
-#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
- _end, _subvolid, _flags, _k, _do) \
-({ \
- struct bkey_s_c _k; \
- int _ret3 = 0; \
- \
- do { \
- _ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\
- _end, _subvolid, (_flags)); \
- if (!(_k).k) \
- break; \
- \
- bkey_err(_k) ?: (_do); \
- })); \
- } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
-
-#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \
- _start, _end, _subvolid, _flags, _k, _do) \
-({ \
- struct btree_iter _iter; \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
- _end, _subvolid, _flags, _k, _do); \
-})
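-
-/*
- * Illustrative sketch (hypothetical helper, not from the original file):
- * counting the extents of an inode as seen through one subvolume:
- *
- *	static int count_extents(struct btree_trans *trans, u32 subvolid,
- *				 u64 inum, u64 *nr)
- *	{
- *		*nr = 0;
- *		return for_each_btree_key_in_subvolume_max(trans, iter,
- *				BTREE_ID_extents,
- *				POS(inum, 0), POS(inum, U64_MAX),
- *				subvolid, 0, k, ({
- *			(*nr)++;
- *			0;
- *		}));
- *	}
- *
- * The macro resolves subvolid to its snapshot, sets that snapshot on the
- * iterator, and re-runs the body on transaction restarts.
- */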
-
-int bch2_subvolume_unlink(struct btree_trans *, u32);
-int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
-
-int bch2_initialize_subvolumes(struct bch_fs *);
-int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
-
-void bch2_fs_subvolumes_init_early(struct bch_fs *);
-
-#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
deleted file mode 100644
index e029df7ba89f..000000000000
--- a/fs/bcachefs/subvolume_format.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
-#define _BCACHEFS_SUBVOLUME_FORMAT_H
-
-#define SUBVOL_POS_MIN POS(0, 1)
-#define SUBVOL_POS_MAX POS(0, S32_MAX)
-#define BCACHEFS_ROOT_SUBVOL 1
-
-struct bch_subvolume {
- struct bch_val v;
- __le32 flags;
- __le32 snapshot;
- __le64 inode;
- /*
- * Snapshot subvolumes form a tree, separate from the snapshot nodes
- * tree - if this subvolume is a snapshot, this is the ID of the
- * subvolume it was created from:
- *
- * This is _not_ necessarily the subvolume of the directory containing
- * this subvolume:
- */
- __le32 creation_parent;
- __le32 fs_path_parent;
- bch_le128 otime;
-};
-
-LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
-/*
- * We need to know whether a subvolume is a snapshot so we can know whether we
- * can delete it (or whether it should just be rm -rf'd)
- */
-LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
-LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
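-
-/*
- * Illustrative note (not from the original file): LE32_BITMASK() generates
- * inline accessors over the flags field above, so callers read and set these
- * bits as, e.g.,
- *
- *	if (!BCH_SUBVOLUME_RO(&subvol->v))
- *		SET_BCH_SUBVOLUME_RO(&subvol->v, true);
- *
- * as bch2_subvolume_create() and bch2_subvolume_unlink() do for the RO/SNAP
- * and UNLINKED bits respectively.
- */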
-
-#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
deleted file mode 100644
index 9d634b906dcd..000000000000
--- a/fs/bcachefs/subvolume_types.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
-#define _BCACHEFS_SUBVOLUME_TYPES_H
-
-typedef struct {
- /* we can't have padding in this struct: */
- u64 subvol;
- u64 inum;
-} subvol_inum;
-
-#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
deleted file mode 100644
index 6c2e1d647403..000000000000
--- a/fs/bcachefs/super-io.c
+++ /dev/null
@@ -1,1562 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "checksum.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "journal.h"
-#include "journal_sb.h"
-#include "journal_seq_blacklist.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "quota.h"
-#include "sb-clean.h"
-#include "sb-counters.h"
-#include "sb-downgrade.h"
-#include "sb-errors.h"
-#include "sb-members.h"
-#include "super-io.h"
-#include "super.h"
-#include "trace.h"
-#include "vstructs.h"
-
-#include <linux/backing-dev.h>
-#include <linux/sort.h>
-#include <linux/string_choices.h>
-
-struct bch2_metadata_version {
- u16 version;
- const char *name;
-};
-
-static const struct bch2_metadata_version bch2_metadata_versions[] = {
-#define x(n, v) { \
- .version = v, \
- .name = #n, \
-},
- BCH_METADATA_VERSIONS()
-#undef x
-};
-
-void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v)
-{
- const char *str = "(unknown version)";
-
- for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
- if (bch2_metadata_versions[i].version == v) {
- str = bch2_metadata_versions[i].name;
- break;
- }
-
- prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str);
-}
-
-enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v)
-{
- if (!BCH_VERSION_MAJOR(v))
- return v;
-
- for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
- if (bch2_metadata_versions[i].version > v &&
- BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) ==
- BCH_VERSION_MAJOR(v))
- v = bch2_metadata_versions[i].version;
-
- return v;
-}
-
-int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
-{
- int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
- version <= c->sb.version_incompat_allowed)
- ? 0
- : -BCH_ERR_may_not_use_incompat_feature;
-
- mutex_lock(&c->sb_lock);
- if (!ret) {
- SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
- max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
- bch2_write_super(c);
- } else {
- darray_for_each(c->incompat_versions_requested, i)
- if (version == *i)
- goto out;
-
- darray_push(&c->incompat_versions_requested, version);
- struct printbuf buf = PRINTBUF;
- prt_str(&buf, "requested incompat feature ");
- bch2_version_to_text(&buf, version);
- prt_str(&buf, " currently not enabled, allowed up to ");
-		bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
- prt_printf(&buf, "\n set version_upgrade=incompat to enable");
-
- bch_notice(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
-out:
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-const char * const bch2_sb_fields[] = {
-#define x(name, nr) #name,
- BCH_SB_FIELDS()
-#undef x
- NULL
-};
-
-static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
- enum bch_validate_flags, struct printbuf *);
-
-struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
- enum bch_sb_field_type type)
-{
- /* XXX: need locking around superblock to access optional fields */
-
- vstruct_for_each(sb, f)
- if (le32_to_cpu(f->type) == type)
- return f;
- return NULL;
-}
-
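-/*
- * Resize (or create/delete) an optional superblock field in place: any fields
- * following it are shifted with memmove(), the superblock's total u64s is
- * updated, and NULL is returned when the field ends up deleted (u64s == 0).
- */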
-static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
- struct bch_sb_field *f,
- unsigned u64s)
-{
- unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
-
- BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);
-
- if (!f && !u64s) {
- /* nothing to do: */
- } else if (!f) {
- f = vstruct_last(sb->sb);
- memset(f, 0, sizeof(u64) * u64s);
- f->u64s = cpu_to_le32(u64s);
- f->type = 0;
- } else {
- void *src, *dst;
-
- src = vstruct_end(f);
-
- if (u64s) {
- f->u64s = cpu_to_le32(u64s);
- dst = vstruct_end(f);
- } else {
- dst = f;
- }
-
- memmove(dst, src, vstruct_end(sb->sb) - src);
-
- if (dst > src)
- memset(src, 0, dst - src);
- }
-
- sb->sb->u64s = cpu_to_le32(sb_u64s);
-
- return u64s ? f : NULL;
-}
-
-void bch2_sb_field_delete(struct bch_sb_handle *sb,
- enum bch_sb_field_type type)
-{
- struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
-
- if (f)
- __bch2_sb_field_resize(sb, f, 0);
-}
-
-/* Superblock realloc/free: */
-
-void bch2_free_super(struct bch_sb_handle *sb)
-{
- kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->s_bdev_file))
- bdev_fput(sb->s_bdev_file);
- kfree(sb->holder);
- kfree(sb->sb_name);
-
- kfree(sb->sb);
- memset(sb, 0, sizeof(*sb));
-}
-
-int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
-{
- size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
- size_t new_buffer_size;
- struct bch_sb *new_sb;
- struct bio *bio;
-
- if (sb->bdev)
- new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev));
-
- new_buffer_size = roundup_pow_of_two(new_bytes);
-
- if (sb->sb && sb->buffer_size >= new_buffer_size)
- return 0;
-
- if (sb->sb && sb->have_layout) {
- u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
-
- if (new_bytes > max_bytes) {
- struct printbuf buf = PRINTBUF;
-
- prt_bdevname(&buf, sb->bdev);
- prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes);
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
- return -BCH_ERR_ENOSPC_sb;
- }
- }
-
- if (sb->buffer_size >= new_buffer_size && sb->sb)
- return 0;
-
- if (dynamic_fault("bcachefs:add:super_realloc"))
- return -BCH_ERR_ENOMEM_sb_realloc_injected;
-
- new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
- if (!new_sb)
- return -BCH_ERR_ENOMEM_sb_buf_realloc;
-
- sb->sb = new_sb;
-
- if (sb->have_bio) {
- unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size);
-
- bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
- if (!bio)
- return -BCH_ERR_ENOMEM_sb_bio_realloc;
-
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
-
- kfree(sb->bio);
- sb->bio = bio;
- }
-
- sb->buffer_size = new_buffer_size;
-
- return 0;
-}
-
-struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
- enum bch_sb_field_type type,
- unsigned u64s)
-{
- struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
- ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- ssize_t d = -old_u64s + u64s;
-
- if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
- return NULL;
-
- if (sb->fs_sb) {
- struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
-
- lockdep_assert_held(&c->sb_lock);
-
-	/* XXX: we're not checking that offline devices have enough space */
-
- for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) {
- struct bch_sb_handle *dev_sb = &ca->disk_sb;
-
- if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize);
- return NULL;
- }
- }
- }
-
- f = bch2_sb_field_get_id(sb->sb, type);
- f = __bch2_sb_field_resize(sb, f, u64s);
- if (f)
- f->type = cpu_to_le32(type);
- return f;
-}
-
-struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb,
- enum bch_sb_field_type type,
- unsigned u64s)
-{
- struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
-
- if (!f || le32_to_cpu(f->u64s) < u64s)
- f = bch2_sb_field_resize_id(sb, type, u64s);
- return f;
-}
-
-/* Superblock validate: */
-
-static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
-{
- u64 offset, prev_offset, max_sectors;
- unsigned i;
-
- BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
-
- if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) &&
- !uuid_equal(&layout->magic, &BCHFS_MAGIC)) {
- prt_printf(out, "Not a bcachefs superblock layout");
- return -BCH_ERR_invalid_sb_layout;
- }
-
- if (layout->layout_type != 0) {
- prt_printf(out, "Invalid superblock layout type %u",
- layout->layout_type);
- return -BCH_ERR_invalid_sb_layout_type;
- }
-
- if (!layout->nr_superblocks) {
- prt_printf(out, "Invalid superblock layout: no superblocks");
- return -BCH_ERR_invalid_sb_layout_nr_superblocks;
- }
-
- if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
- prt_printf(out, "Invalid superblock layout: too many superblocks");
- return -BCH_ERR_invalid_sb_layout_nr_superblocks;
- }
-
- if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) {
- prt_printf(out, "Invalid superblock layout: max_size_bits too high");
- return -BCH_ERR_invalid_sb_layout_sb_max_size_bits;
- }
-
- max_sectors = 1 << layout->sb_max_size_bits;
-
- prev_offset = le64_to_cpu(layout->sb_offset[0]);
-
- for (i = 1; i < layout->nr_superblocks; i++) {
- offset = le64_to_cpu(layout->sb_offset[i]);
-
- if (offset < prev_offset + max_sectors) {
- prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
- " (sb %u ends at %llu next starts at %llu",
- i - 1, prev_offset + max_sectors, offset);
- return -BCH_ERR_invalid_sb_layout_superblocks_overlap;
- }
- prev_offset = offset;
- }
-
- return 0;
-}
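-
-/*
- * Worked example (illustrative): with sb_max_size_bits == 11, max_sectors is
- * 1 << 11 = 2048 sectors (1 MiB), so consecutive entries in sb_offset[] must
- * be at least 2048 sectors apart to pass the overlap check above.
- */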
-
-static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
-{
- u16 version = le16_to_cpu(sb->version);
- u16 version_min = le16_to_cpu(sb->version_min);
-
- if (!bch2_version_compatible(version)) {
- prt_str(out, "Unsupported superblock version ");
- bch2_version_to_text(out, version);
- prt_str(out, " (min ");
- bch2_version_to_text(out, bcachefs_metadata_version_min);
- prt_str(out, ", max ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- prt_str(out, ")");
- return -BCH_ERR_invalid_sb_version;
- }
-
- if (!bch2_version_compatible(version_min)) {
- prt_str(out, "Unsupported superblock version_min ");
- bch2_version_to_text(out, version_min);
- prt_str(out, " (min ");
- bch2_version_to_text(out, bcachefs_metadata_version_min);
- prt_str(out, ", max ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- prt_str(out, ")");
- return -BCH_ERR_invalid_sb_version;
- }
-
- if (version_min > version) {
- prt_str(out, "Bad minimum version ");
- bch2_version_to_text(out, version_min);
- prt_str(out, ", greater than version field ");
- bch2_version_to_text(out, version);
- return -BCH_ERR_invalid_sb_version;
- }
-
- return 0;
-}
-
-int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
- enum bch_validate_flags flags, struct printbuf *out)
-{
- enum bch_opt_id opt_id;
- int ret;
-
- ret = bch2_sb_compatible(sb, out);
- if (ret)
- return ret;
-
- u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
- unsigned incompat_bit = 0;
- if (incompat)
- incompat_bit = __ffs64(incompat);
- else if (sb->features[1])
- incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
-
- if (incompat_bit) {
- prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
- incompat_bit,
- bch2_sb_features[BCH_FEATURE_NR - 1],
- BCH_FEATURE_NR - 1);
- return -BCH_ERR_invalid_sb_features;
- }
-
- if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
- BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
- prt_str(out, "Filesystem has incompatible version ");
- bch2_version_to_text(out, le16_to_cpu(sb->version));
- prt_str(out, ", current version ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- return -BCH_ERR_invalid_sb_features;
- }
-
- if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
- prt_printf(out, "Bad user UUID (got zeroes)");
- return -BCH_ERR_invalid_sb_uuid;
- }
-
- if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
- prt_printf(out, "Bad internal UUID (got zeroes)");
- return -BCH_ERR_invalid_sb_uuid;
- }
-
- if (!(flags & BCH_VALIDATE_write) &&
- le64_to_cpu(sb->offset) != read_offset) {
- prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
- le64_to_cpu(sb->offset), read_offset);
- return -BCH_ERR_invalid_sb_offset;
- }
-
- if (!sb->nr_devices ||
- sb->nr_devices > BCH_SB_MEMBERS_MAX) {
- prt_printf(out, "Bad number of member devices %u (max %u)",
- sb->nr_devices, BCH_SB_MEMBERS_MAX);
- return -BCH_ERR_invalid_sb_too_many_members;
- }
-
- if (sb->dev_idx >= sb->nr_devices) {
- prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
- sb->dev_idx, sb->nr_devices);
- return -BCH_ERR_invalid_sb_dev_idx;
- }
-
- if (!sb->time_precision ||
- le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
- prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
- le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
- return -BCH_ERR_invalid_sb_time_precision;
- }
-
- /* old versions didn't know to downgrade this field */
- if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version))
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version));
-
- if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) {
- prt_printf(out, "Invalid version_incompat ");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
- prt_str(out, " > incompat_allowed ");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
- if (flags & BCH_VALIDATE_write)
- return -BCH_ERR_invalid_sb_version;
- else
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
- }
-
- if (sb->nr_devices > 1)
- SET_BCH_SB_MULTI_DEVICE(sb, true);
-
- if (!flags) {
- /*
- * Been seeing a bug where these are getting inexplicably
- * zeroed, so we're now validating them, but we have to be
-		 * careful not to prevent people's filesystems from mounting:
- */
- if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
- SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
- if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
- SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
-
- if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
- !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
- SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
- SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
-
- if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
- SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
- !BCH_SB_CSUM_ERR_RETRY_NR(sb))
- SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
- }
-
-#ifdef __KERNEL__
- if (!BCH_SB_SHARD_INUMS_NBITS(sb))
- SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus())));
-#endif
-
- for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
- const struct bch_option *opt = bch2_opt_table + opt_id;
-
- if (opt->get_sb) {
- u64 v = bch2_opt_from_sb(sb, opt_id, -1);
-
- prt_printf(out, "Invalid option ");
- ret = bch2_opt_validate(opt, v, out);
- if (ret)
- return ret;
-
- printbuf_reset(out);
- }
- }
-
- /* validate layout */
- ret = validate_sb_layout(&sb->layout, out);
- if (ret)
- return ret;
-
- vstruct_for_each(sb, f) {
- if (!f->u64s) {
- prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)",
- le32_to_cpu(f->type));
- return -BCH_ERR_invalid_sb_field_size;
- }
-
- if (vstruct_next(f) > vstruct_last(sb)) {
- prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
- le32_to_cpu(f->type));
- return -BCH_ERR_invalid_sb_field_size;
- }
- }
-
- struct bch_sb_field *mi =
- bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?:
- bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1);
-
- /* members must be validated first: */
- if (!mi) {
- prt_printf(out, "Invalid superblock: member info area missing");
- return -BCH_ERR_invalid_sb_members_missing;
- }
-
- ret = bch2_sb_field_validate(sb, mi, flags, out);
- if (ret)
- return ret;
-
- vstruct_for_each(sb, f) {
- if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
- continue;
-
- ret = bch2_sb_field_validate(sb, f, flags, out);
- if (ret)
- return ret;
- }
-
- if ((flags & BCH_VALIDATE_write) &&
- bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
- prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
- le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
- le64_to_cpu(sb->seq));
- return -BCH_ERR_invalid_sb_members_missing;
- }
-
- return 0;
-}
-
-/* device open: */
-
-static unsigned long le_ulong_to_cpu(unsigned long v)
-{
- return sizeof(unsigned long) == 8
- ? le64_to_cpu(v)
- : le32_to_cpu(v);
-}
-
-static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr)
-{
- BUG_ON(nr & (BITS_PER_TYPE(long) - 1));
-
- for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++)
- dst[i] = le_ulong_to_cpu(src[i]);
-}
-
-static void bch2_sb_update(struct bch_fs *c)
-{
- struct bch_sb *src = c->disk_sb.sb;
-
- lockdep_assert_held(&c->sb_lock);
-
- c->sb.uuid = src->uuid;
- c->sb.user_uuid = src->user_uuid;
- c->sb.version = le16_to_cpu(src->version);
- c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src);
- c->sb.version_incompat_allowed
- = BCH_SB_VERSION_INCOMPAT_ALLOWED(src);
- c->sb.version_min = le16_to_cpu(src->version_min);
- c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src);
- c->sb.nr_devices = src->nr_devices;
- c->sb.clean = BCH_SB_CLEAN(src);
- c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
-
- c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
- c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
-
- /* XXX this is wrong, we need a 96 or 128 bit integer type */
- c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo),
- c->sb.nsec_per_time_unit);
- c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
-
- c->sb.features = le64_to_cpu(src->features[0]);
- c->sb.compat = le64_to_cpu(src->compat[0]);
- c->sb.multi_device = BCH_SB_MULTI_DEVICE(src);
-
- memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
- if (ext) {
- c->sb.recovery_passes_required =
- bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
- le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
- sizeof(c->sb.errors_silent) * 8);
- c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
- }
-
- for_each_member_device(c, ca) {
- struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
- ca->mi = bch2_mi_to_cpu(&m);
- }
-}
-
-static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
-{
- struct bch_sb_field *src_f, *dst_f;
- struct bch_sb *dst = dst_handle->sb;
- unsigned i;
-
- dst->version = src->version;
- dst->version_min = src->version_min;
- dst->seq = src->seq;
- dst->uuid = src->uuid;
- dst->user_uuid = src->user_uuid;
- memcpy(dst->label, src->label, sizeof(dst->label));
-
- dst->block_size = src->block_size;
- dst->nr_devices = src->nr_devices;
-
- dst->time_base_lo = src->time_base_lo;
- dst->time_base_hi = src->time_base_hi;
- dst->time_precision = src->time_precision;
- dst->write_time = src->write_time;
-
- memcpy(dst->flags, src->flags, sizeof(dst->flags));
- memcpy(dst->features, src->features, sizeof(dst->features));
- memcpy(dst->compat, src->compat, sizeof(dst->compat));
-
- for (i = 0; i < BCH_SB_FIELD_NR; i++) {
- int d;
-
- if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
- continue;
-
- src_f = bch2_sb_field_get_id(src, i);
- dst_f = bch2_sb_field_get_id(dst, i);
-
- d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
- (dst_f ? le32_to_cpu(dst_f->u64s) : 0);
- if (d > 0) {
- int ret = bch2_sb_realloc(dst_handle,
- le32_to_cpu(dst_handle->sb->u64s) + d);
-
- if (ret)
- return ret;
-
- dst = dst_handle->sb;
- dst_f = bch2_sb_field_get_id(dst, i);
- }
-
- dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
- src_f ? le32_to_cpu(src_f->u64s) : 0);
-
- if (src_f)
- memcpy(dst_f, src_f, vstruct_bytes(src_f));
- }
-
- return 0;
-}
-
-int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
-{
- int ret;
-
- lockdep_assert_held(&c->sb_lock);
-
- ret = bch2_sb_realloc(&c->disk_sb, 0) ?:
- __copy_super(&c->disk_sb, src) ?:
- bch2_sb_replicas_to_cpu_replicas(c) ?:
- bch2_sb_disk_groups_to_cpu(c);
- if (ret)
- return ret;
-
- bch2_sb_update(c);
- return 0;
-}
-
-int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
-{
- return __copy_super(&ca->disk_sb, c->disk_sb.sb);
-}
-
-/* read superblock: */
-
-static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
-{
- size_t bytes;
- int ret;
-reread:
- bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
- sb->bio->bi_iter.bi_sector = offset;
- bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
-
- ret = submit_bio_wait(sb->bio);
- if (ret) {
- prt_printf(err, "IO error: %i", ret);
- return ret;
- }
-
- if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
- !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
- prt_str(err, "Not a bcachefs superblock (got magic ");
- pr_uuid(err, sb->sb->magic.b);
- prt_str(err, ")");
- return -BCH_ERR_invalid_sb_magic;
- }
-
- ret = bch2_sb_compatible(sb->sb, err);
- if (ret)
- return ret;
-
- bytes = vstruct_bytes(sb->sb);
-
- u64 sb_size = 512ULL << min(BCH_SB_LAYOUT_SIZE_BITS_MAX, sb->sb->layout.sb_max_size_bits);
- if (bytes > sb_size) {
- prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %llu)",
- bytes, sb_size);
- return -BCH_ERR_invalid_sb_too_big;
- }
-
- if (bytes > sb->buffer_size) {
- ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s));
- if (ret)
- return ret;
- goto reread;
- }
-
- enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
- if (csum_type >= BCH_CSUM_NR ||
- bch2_csum_type_is_encryption(csum_type)) {
- prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
- return -BCH_ERR_invalid_sb_csum_type;
- }
-
- /* XXX: verify MACs */
- struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb);
- if (bch2_crc_cmp(csum, sb->sb->csum)) {
- bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum);
- return -BCH_ERR_invalid_sb_csum;
- }
-
- sb->seq = le64_to_cpu(sb->sb->seq);
-
- return 0;
-}
-
-static int __bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
-{
- u64 offset = opt_get(*opts, sb);
- struct bch_sb_layout layout;
- struct printbuf err = PRINTBUF;
- struct printbuf err2 = PRINTBUF;
- __le64 *i;
- int ret;
-#ifndef __KERNEL__
-retry:
-#endif
- memset(sb, 0, sizeof(*sb));
- sb->mode = BLK_OPEN_READ;
- sb->have_bio = true;
- sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
- if (!sb->holder)
- return -ENOMEM;
-
- sb->sb_name = kstrdup(path, GFP_KERNEL);
- if (!sb->sb_name) {
- ret = -ENOMEM;
- prt_printf(&err, "error allocating memory for sb_name");
- goto err;
- }
-
-#ifndef __KERNEL__
- if (opt_get(*opts, direct_io) == false)
- sb->mode |= BLK_OPEN_BUFFERED;
-#endif
-
- if (!opt_get(*opts, noexcl))
- sb->mode |= BLK_OPEN_EXCL;
-
- if (!opt_get(*opts, nochanges))
- sb->mode |= BLK_OPEN_WRITE;
-
- sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->s_bdev_file) &&
- PTR_ERR(sb->s_bdev_file) == -EACCES &&
- opt_get(*opts, read_only)) {
- sb->mode &= ~BLK_OPEN_WRITE;
-
- sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->s_bdev_file))
- opt_set(*opts, nochanges, true);
- }
-
- if (IS_ERR(sb->s_bdev_file)) {
- ret = PTR_ERR(sb->s_bdev_file);
- prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
- goto err;
- }
- sb->bdev = file_bdev(sb->s_bdev_file);
-
- ret = bch2_sb_realloc(sb, 0);
- if (ret) {
- prt_printf(&err, "error allocating memory for superblock");
- goto err;
- }
-
- if (bch2_fs_init_fault("read_super")) {
- prt_printf(&err, "dynamic fault");
- ret = -EFAULT;
- goto err;
- }
-
- ret = read_one_super(sb, offset, &err);
- if (!ret)
- goto got_super;
-
- if (opt_defined(*opts, sb))
- goto err;
-
- prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
- path, err.buf);
- if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
- bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
- else
- bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
-
- printbuf_exit(&err2);
- printbuf_reset(&err);
-
- /*
- * Error reading primary superblock - read location of backup
- * superblocks:
- */
- bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
- sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
- /*
- * use sb buffer to read layout, since sb buffer is page aligned but
- * layout won't be:
- */
- bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
-
- ret = submit_bio_wait(sb->bio);
- if (ret) {
- prt_printf(&err, "IO error: %i", ret);
- goto err;
- }
-
- memcpy(&layout, sb->sb, sizeof(layout));
- ret = validate_sb_layout(&layout, &err);
- if (ret)
- goto err;
-
- for (i = layout.sb_offset;
- i < layout.sb_offset + layout.nr_superblocks; i++) {
- offset = le64_to_cpu(*i);
-
- if (offset == opt_get(*opts, sb)) {
- ret = -BCH_ERR_invalid;
- continue;
- }
-
- ret = read_one_super(sb, offset, &err);
- if (!ret)
- goto got_super;
- }
-
- goto err;
-
-got_super:
- if (le16_to_cpu(sb->sb->block_size) << 9 <
- bdev_logical_block_size(sb->bdev) &&
- opt_get(*opts, direct_io)) {
-#ifndef __KERNEL__
- opt_set(*opts, direct_io, false);
- bch2_free_super(sb);
- goto retry;
-#endif
- prt_printf(&err, "block size (%u) smaller than device block size (%u)",
- le16_to_cpu(sb->sb->block_size) << 9,
- bdev_logical_block_size(sb->bdev));
- ret = -BCH_ERR_block_size_too_small;
- goto err;
- }
-
- sb->have_layout = true;
-
- ret = bch2_sb_validate(sb->sb, offset, 0, &err);
- if (ret) {
- bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
- path, err.buf);
- goto err_no_print;
- }
-out:
- printbuf_exit(&err);
- return ret;
-err:
- bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
- path, err.buf);
-err_no_print:
- bch2_free_super(sb);
- goto out;
-}
-
-int bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
-{
- return __bch2_read_super(path, opts, sb, false);
-}
-
-/* provide a silenced version for mount.bcachefs */
-
-int bch2_read_super_silent(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
-{
- return __bch2_read_super(path, opts, sb, true);
-}
-
-/* write superblock: */
-
-static void write_super_endio(struct bio *bio)
-{
- struct bch_dev *ca = bio->bi_private;
-
- bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
-
- /* XXX: return errors directly */
-
- if (bio->bi_status) {
- bch_err_dev_ratelimited(ca, "superblock %s error: %s",
- str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status));
- ca->sb_write_error = 1;
- }
-
- closure_put(&ca->fs->sb_write);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
-}
-
-static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bch_sb *sb = ca->disk_sb.sb;
- struct bio *bio = ca->disk_sb.bio;
-
- memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE);
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
- bio->bi_end_io = write_super_endio;
- bio->bi_private = ca;
- bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE);
-
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
-
- enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
- closure_bio_submit(bio, &c->sb_write);
-}
-
-static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
-{
- struct bch_sb *sb = ca->disk_sb.sb;
- struct bio *bio = ca->disk_sb.bio;
-
- sb->offset = sb->layout.sb_offset[idx];
-
- SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));
- sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
- null_nonce(), sb);
-
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
- bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
- bio->bi_end_io = write_super_endio;
- bio->bi_private = ca;
- bch2_bio_map(bio, sb,
- roundup((size_t) vstruct_bytes(sb),
- bdev_logical_block_size(ca->disk_sb.bdev)));
-
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
- bio_sectors(bio));
-
- enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
- closure_bio_submit(bio, &c->sb_write);
-}
-
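-/*
- * Write the superblock out to every online device: read back each device's
- * current superblock first to catch sequence-number mismatches (writes that
- * were silently dropped, or modification by another process), then write each
- * superblock copy on all devices in lockstep, and finally verify that enough
- * devices were written successfully for the filesystem to remain mountable.
- */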
-int bch2_write_super(struct bch_fs *c)
-{
- struct closure *cl = &c->sb_write;
- struct printbuf err = PRINTBUF;
- unsigned sb = 0, nr_wrote;
- struct bch_devs_mask sb_written;
- bool wrote, can_mount_without_written, can_mount_with_written;
- unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
- DARRAY(struct bch_dev *) online_devices = {};
- int ret = 0;
-
- trace_and_count(c, write_super, c, _RET_IP_);
-
- if (c->opts.degraded == BCH_DEGRADED_very)
- degraded_flags |= BCH_FORCE_IF_LOST;
-
- lockdep_assert_held(&c->sb_lock);
-
- closure_init_stack(cl);
- memset(&sb_written, 0, sizeof(sb_written));
-
- /*
- * Note: we do writes to RO devices here, and we might want to change
- * that in the future.
- *
- * For now, we expect to be able to call write_super() when we're not
- * yet RW:
- */
- for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) {
- ret = darray_push(&online_devices, ca);
- if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
- goto out;
- }
- enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
- }
-
- /* Make sure we're using the new magic numbers: */
- c->disk_sb.sb->magic = BCHFS_MAGIC;
- c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
-
- le64_add_cpu(&c->disk_sb.sb->seq, 1);
-
- struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- darray_for_each(online_devices, ca)
- __bch2_members_v2_get_mut(mi, (*ca)->dev_idx)->seq = c->disk_sb.sb->seq;
- c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
-
- if (test_bit(BCH_FS_error, &c->flags))
- SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
- if (test_bit(BCH_FS_topology_error, &c->flags))
- SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
-
- SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
-
- bch2_sb_counters_from_cpu(c);
- bch2_sb_members_from_cpu(c);
- bch2_sb_members_cpy_v2_v1(&c->disk_sb);
- bch2_sb_errors_from_cpu(c);
- bch2_sb_downgrade_update(c);
-
- darray_for_each(online_devices, ca)
- bch2_sb_from_fs(c, (*ca));
-
- darray_for_each(online_devices, ca) {
- printbuf_reset(&err);
-
- ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
- if (ret) {
- bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
- goto out;
- }
- }
-
- if (c->opts.nochanges)
- goto out;
-
- /*
- * Defer writing the superblock until filesystem initialization is
- * complete - don't write out a partly initialized superblock:
- */
- if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
- goto out;
-
- if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) {
- struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "attempting to write superblock that wasn't version downgraded (");
- bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version));
- prt_str(&buf, " > ");
- bch2_version_to_text(&buf, bcachefs_metadata_version_current);
- prt_str(&buf, ")");
- bch2_fs_fatal_error(c, ": %s", buf.buf);
- printbuf_exit(&buf);
- ret = bch_err_throw(c, sb_not_downgraded);
- goto out;
- }
-
- darray_for_each(online_devices, ca) {
- __set_bit((*ca)->dev_idx, sb_written.d);
- (*ca)->sb_write_error = 0;
- }
-
- darray_for_each(online_devices, ca)
- read_back_super(c, *ca);
- closure_sync(cl);
-
- darray_for_each(online_devices, cap) {
- struct bch_dev *ca = *cap;
-
- if (ca->sb_write_error)
- continue;
-
- if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
- struct printbuf buf = PRINTBUF;
- prt_char(&buf, ' ');
- prt_bdevname(&buf, ca->disk_sb.bdev);
- prt_printf(&buf,
- ": Superblock write was silently dropped! (seq %llu expected %llu)",
- le64_to_cpu(ca->sb_read_scratch->seq),
- ca->disk_sb.seq);
-
- if (c->opts.errors != BCH_ON_ERROR_continue &&
- c->opts.errors != BCH_ON_ERROR_fix_safe) {
- ret = bch_err_throw(c, erofs_sb_err);
- bch2_fs_fatal_error(c, "%s", buf.buf);
- } else {
- bch_err(c, "%s", buf.buf);
- }
-
- printbuf_exit(&buf);
- }
-
- if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
- struct printbuf buf = PRINTBUF;
- prt_char(&buf, ' ');
- prt_bdevname(&buf, ca->disk_sb.bdev);
- prt_printf(&buf,
- ": Superblock modified by another process (seq %llu expected %llu)",
- le64_to_cpu(ca->sb_read_scratch->seq),
- ca->disk_sb.seq);
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = bch_err_throw(c, erofs_sb_err);
- }
- }
-
- if (ret)
- goto out;
-
- do {
- wrote = false;
- darray_for_each(online_devices, cap) {
- struct bch_dev *ca = *cap;
- if (!ca->sb_write_error &&
- sb < ca->disk_sb.sb->layout.nr_superblocks) {
- write_one_super(c, ca, sb);
- wrote = true;
- }
- }
- closure_sync(cl);
- sb++;
- } while (wrote);
-
- darray_for_each(online_devices, cap) {
- struct bch_dev *ca = *cap;
- if (ca->sb_write_error)
- __clear_bit(ca->dev_idx, sb_written.d);
- else
- ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
- }
-
- nr_wrote = dev_mask_nr(&sb_written);
-
- can_mount_with_written =
- bch2_have_enough_devs(c, sb_written, degraded_flags, false);
-
- for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
- sb_written.d[i] = ~sb_written.d[i];
-
- can_mount_without_written =
- bch2_have_enough_devs(c, sb_written, degraded_flags, false);
-
- /*
- * If we would be able to mount _without_ the devices we successfully
- * wrote superblocks to, we weren't able to write to enough devices:
- *
- * Exception: if we can mount without the successes because we haven't
- * written anything (new filesystem), we continue if we'd be able to
- * mount with the devices we did successfully write to:
- */
- if (bch2_fs_fatal_err_on(!nr_wrote ||
- !can_mount_with_written ||
- (can_mount_without_written &&
- !can_mount_with_written), c,
- ": Unable to write superblock to sufficient devices (from %ps)",
- (void *) _RET_IP_))
- ret = bch_err_throw(c, erofs_sb_err);
-out:
- /* Make new options visible after they're persistent: */
- bch2_sb_update(c);
- darray_for_each(online_devices, ca)
- enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super);
- darray_exit(&online_devices);
- printbuf_exit(&err);
- return ret;
-}
-
-void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
-{
- mutex_lock(&c->sb_lock);
- if (!(c->sb.features & (1ULL << feat))) {
- c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
-
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
-}
-
-/* Downgrade if superblock is at a higher version than currently supported: */
-bool bch2_check_version_downgrade(struct bch_fs *c)
-{
- bool ret = bcachefs_metadata_version_current < c->sb.version;
-
- lockdep_assert_held(&c->sb_lock);
-
- /*
- * Downgrade, if superblock is at a higher version than currently
- * supported:
- *
- * c->sb will be checked before we write the superblock, so update it as
- * well:
- */
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current)
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current);
- if (c->sb.version > bcachefs_metadata_version_current)
- c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
- if (c->sb.version_min > bcachefs_metadata_version_current)
- c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
- c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
- return ret;
-}
-
-void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
-{
- lockdep_assert_held(&c->sb_lock);
-
- if (BCH_VERSION_MAJOR(new_version) >
- BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
- bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
-
- c->disk_sb.sb->version = cpu_to_le16(new_version);
-
- if (incompat) {
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
- max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
- }
-}
-
-void bch2_sb_upgrade_incompat(struct bch_fs *c)
-{
- mutex_lock(&c->sb_lock);
- if (c->sb.version == c->sb.version_incompat_allowed)
- goto unlock;
-
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Now allowing incompatible features up to ");
- bch2_version_to_text(&buf, c->sb.version);
- prt_str(&buf, ", previously allowed up to ");
- bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
- prt_newline(&buf);
-
- bch_notice(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
- SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
- max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version));
- bch2_write_super(c);
-unlock:
- mutex_unlock(&c->sb_lock);
-}
-
-static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- if (vstruct_bytes(f) < 88) {
- prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88);
- return -BCH_ERR_invalid_sb_ext;
- }
-
- return 0;
-}
-
-static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_ext *e = field_to_type(f, ext);
-
- prt_printf(out, "Recovery passes required:\t");
- prt_bitflags(out, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0])));
- prt_newline(out);
-
- unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL);
- if (errors_silent) {
- le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
-
- prt_printf(out, "Errors to silently fix:\t");
- prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent,
- min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8));
- prt_newline(out);
-
- kfree(errors_silent);
- }
-
- prt_printf(out, "Btrees with missing data:\t");
- prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
- prt_newline(out);
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
- .validate = bch2_sb_ext_validate,
- .to_text = bch2_sb_ext_to_text,
-};
-
-static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
-#define x(f, nr) \
- [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
- BCH_SB_FIELDS()
-#undef x
-};
-
-static const struct bch_sb_field_ops bch2_sb_field_null_ops;
-
-static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
-{
- return likely(type < ARRAY_SIZE(bch2_sb_field_ops))
- ? bch2_sb_field_ops[type]
- : &bch2_sb_field_null_ops;
-}
-
-static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
- enum bch_validate_flags flags, struct printbuf *err)
-{
- unsigned type = le32_to_cpu(f->type);
- struct printbuf field_err = PRINTBUF;
- const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
- int ret;
-
- ret = ops->validate ? ops->validate(sb, f, flags, &field_err) : 0;
- if (ret) {
- prt_printf(err, "Invalid superblock section %s: %s",
- bch2_sb_fields[type], field_err.buf);
- prt_newline(err);
- bch2_sb_field_to_text(err, sb, f);
- }
-
- printbuf_exit(&field_err);
- return ret;
-}
-
-void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- unsigned type = le32_to_cpu(f->type);
- const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
-
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 32);
-
- if (ops->to_text)
- ops->to_text(out, sb, f);
-}
-
-void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- unsigned type = le32_to_cpu(f->type);
-
- if (type < BCH_SB_FIELD_NR)
- prt_printf(out, "%s", bch2_sb_fields[type]);
- else
- prt_printf(out, "(unknown field %u)", type);
-
- prt_printf(out, " (size %zu):", vstruct_bytes(f));
- prt_newline(out);
-
- __bch2_sb_field_to_text(out, sb, f);
-}
-
-void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
-{
- unsigned i;
-
- prt_printf(out, "Type: %u", l->layout_type);
- prt_newline(out);
-
- prt_str(out, "Superblock max size: ");
- prt_units_u64(out, 512 << l->sb_max_size_bits);
- prt_newline(out);
-
- prt_printf(out, "Nr superblocks: %u", l->nr_superblocks);
- prt_newline(out);
-
- prt_str(out, "Offsets: ");
- for (i = 0; i < l->nr_superblocks; i++) {
- if (i)
- prt_str(out, ", ");
- prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
- }
- prt_newline(out);
-}
-
-void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
- bool print_layout, unsigned fields)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 44);
-
- prt_printf(out, "External UUID:\t");
- pr_uuid(out, sb->user_uuid.b);
- prt_newline(out);
-
- prt_printf(out, "Internal UUID:\t");
- pr_uuid(out, sb->uuid.b);
- prt_newline(out);
-
- prt_printf(out, "Magic number:\t");
- pr_uuid(out, sb->magic.b);
- prt_newline(out);
-
- prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
-
- prt_printf(out, "Label:\t");
- if (!strlen(sb->label))
- prt_printf(out, "(none)");
- else
- prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
- prt_newline(out);
-
- prt_printf(out, "Version:\t");
- bch2_version_to_text(out, le16_to_cpu(sb->version));
- prt_newline(out);
-
- prt_printf(out, "Incompatible features allowed:\t");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb));
- prt_newline(out);
-
- prt_printf(out, "Incompatible features in use:\t");
- bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb));
- prt_newline(out);
-
- prt_printf(out, "Version upgrade complete:\t");
- bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
- prt_newline(out);
-
- prt_printf(out, "Oldest version on disk:\t");
- bch2_version_to_text(out, le16_to_cpu(sb->version_min));
- prt_newline(out);
-
- prt_printf(out, "Created:\t");
- if (sb->time_base_lo)
- bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
- else
- prt_printf(out, "(not set)");
- prt_newline(out);
-
- prt_printf(out, "Sequence number:\t");
- prt_printf(out, "%llu", le64_to_cpu(sb->seq));
- prt_newline(out);
-
- prt_printf(out, "Time of last write:\t");
- bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
- prt_newline(out);
-
- prt_printf(out, "Superblock size:\t");
- prt_units_u64(out, vstruct_bytes(sb));
- prt_str(out, "/");
- prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
- prt_newline(out);
-
- prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
- prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));
-
- prt_printf(out, "Sections:\t");
- u64 fields_have = 0;
- vstruct_for_each(sb, f)
- fields_have |= 1 << le32_to_cpu(f->type);
- prt_bitflags(out, bch2_sb_fields, fields_have);
- prt_newline(out);
-
- prt_printf(out, "Features:\t");
- prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
- prt_newline(out);
-
- prt_printf(out, "Compat features:\t");
- prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
- prt_newline(out);
-
- prt_newline(out);
- prt_printf(out, "Options:");
- prt_newline(out);
- printbuf_indent_add(out, 2);
- {
- enum bch_opt_id id;
-
- for (id = 0; id < bch2_opts_nr; id++) {
- const struct bch_option *opt = bch2_opt_table + id;
-
- if (opt->get_sb) {
- u64 v = bch2_opt_from_sb(sb, id, -1);
-
- prt_printf(out, "%s:\t", opt->attr.name);
- bch2_opt_to_text(out, NULL, sb, opt, v,
- OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
- prt_newline(out);
- }
- }
- }
-
- printbuf_indent_sub(out, 2);
-
- if (print_layout) {
- prt_newline(out);
- prt_printf(out, "layout:");
- prt_newline(out);
- printbuf_indent_add(out, 2);
- bch2_sb_layout_to_text(out, &sb->layout);
- printbuf_indent_sub(out, 2);
- }
-
- vstruct_for_each(sb, f)
- if (fields & (1 << le32_to_cpu(f->type))) {
- prt_newline(out);
- bch2_sb_field_to_text(out, sb, f);
- }
-}
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
deleted file mode 100644
index a3b7a90f2533..000000000000
--- a/fs/bcachefs/super-io.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_IO_H
-#define _BCACHEFS_SUPER_IO_H
-
-#include "extents.h"
-#include "eytzinger.h"
-#include "super_types.h"
-#include "super.h"
-#include "sb-members.h"
-
-#include <asm/byteorder.h>
-
-#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096
-
-static inline bool bch2_version_compatible(u16 version)
-{
- return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) &&
- version >= bcachefs_metadata_version_min;
-}
-
-void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
-enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
-
-int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
-
-static inline int bch2_request_incompat_feature(struct bch_fs *c,
- enum bcachefs_metadata_version version)
-{
- return likely(version <= c->sb.version_incompat)
- ? 0
- : bch2_set_version_incompat(c, version);
-}
-
-static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
-{
- return le32_to_cpu(f->u64s) * sizeof(u64);
-}
-
-#define field_to_type(_f, _name) \
- container_of_or_null(_f, struct bch_sb_field_##_name, field)
-
-struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type);
-#define bch2_sb_field_get(_sb, _name) \
- field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name)
-
-struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *,
- enum bch_sb_field_type, unsigned);
-#define bch2_sb_field_resize(_sb, _name, _u64s) \
- field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
-
-struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *,
- enum bch_sb_field_type, unsigned);
-#define bch2_sb_field_get_minsize(_sb, _name, _u64s) \
- field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
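-
-/*
- * Illustrative sketch (hypothetical call site, not from the original file):
- * superblock fields are fetched and grown under sb_lock, e.g.
- *
- *	mutex_lock(&c->sb_lock);
- *	struct bch_sb_field_ext *ext =
- *		bch2_sb_field_get_minsize(&c->disk_sb, ext,
- *				DIV_ROUND_UP(sizeof(*ext), sizeof(u64)));
- *	if (ext) {
- *		ext->btrees_lost_data |= cpu_to_le64(BIT_ULL(BTREE_ID_extents));
- *		bch2_write_super(c);
- *	}
- *	mutex_unlock(&c->sb_lock);
- */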
-
-#define bch2_sb_field_nr_entries(_f) \
- (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) / \
- sizeof(_f->entries[0])) \
- : 0)
-
-void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
-
-extern const char * const bch2_sb_fields[];
-
-struct bch_sb_field_ops {
- int (*validate)(struct bch_sb *, struct bch_sb_field *,
- enum bch_validate_flags, struct printbuf *);
- void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
-};
-
-static inline __le64 bch2_sb_magic(struct bch_fs *c)
-{
- __le64 ret;
-
- memcpy(&ret, &c->sb.uuid, sizeof(ret));
- return ret;
-}
-
-static inline __u64 jset_magic(struct bch_fs *c)
-{
- return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
-}
-
-static inline __u64 bset_magic(struct bch_fs *c)
-{
- return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
-}
-
-int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
-int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
-
-void bch2_free_super(struct bch_sb_handle *);
-int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
-
-int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
-
-int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
-int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
-int bch2_write_super(struct bch_fs *);
-void __bch2_check_set_feature(struct bch_fs *, unsigned);
-
-static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
-{
- if (!(c->sb.features & (1ULL << feat)))
- __bch2_check_set_feature(c, feat);
-}
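-
-/*
- * Illustrative sketch (hypothetical call site, not from the original file):
- * feature bits are typically set lazily the first time a feature is used, e.g.
- *
- *	bch2_check_set_feature(c, BCH_FEATURE_ec);
- *
- * which only takes sb_lock and rewrites the superblock if the bit wasn't
- * already set.
- */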
-
-bool bch2_check_version_downgrade(struct bch_fs *);
-void bch2_sb_upgrade(struct bch_fs *, unsigned, bool);
-void bch2_sb_upgrade_incompat(struct bch_fs *);
-
-void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
- struct bch_sb_field *);
-void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
- struct bch_sb_field *);
-void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
-void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
-
-#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
deleted file mode 100644
index c46b1053a02c..000000000000
--- a/fs/bcachefs/super.c
+++ /dev/null
@@ -1,2547 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcachefs setup/teardown code, and some metadata io - read a superblock and
- * figure out what to do with it.
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "async_objs.h"
-#include "backpointers.h"
-#include "bkey_sort.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_journal_iter.h"
-#include "btree_key_cache.h"
-#include "btree_node_scan.h"
-#include "btree_update_interior.h"
-#include "btree_io.h"
-#include "btree_write_buffer.h"
-#include "buckets_waiting_for_journal.h"
-#include "chardev.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "debug.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "errcode.h"
-#include "error.h"
-#include "fs.h"
-#include "fs-io.h"
-#include "fs-io-buffered.h"
-#include "fs-io-direct.h"
-#include "fsck.h"
-#include "inode.h"
-#include "io_read.h"
-#include "io_write.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
-#include "move.h"
-#include "migrate.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "quota.h"
-#include "rebalance.h"
-#include "recovery.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-clean.h"
-#include "sb-counters.h"
-#include "sb-errors.h"
-#include "sb-members.h"
-#include "snapshot.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-#include "sysfs.h"
-#include "thread_with_file.h"
-#include "trace.h"
-
-#include <linux/backing-dev.h>
-#include <linux/blkdev.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/idr.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/random.h>
-#include <linux/sysfs.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
-MODULE_DESCRIPTION("bcachefs filesystem");
-
-typedef DARRAY(struct bch_sb_handle) bch_sb_handles;
-
-#define x(n) #n,
-const char * const bch2_fs_flag_strs[] = {
- BCH_FS_FLAGS()
- NULL
-};
-
-const char * const bch2_write_refs[] = {
- BCH_WRITE_REFS()
- NULL
-};
-
-const char * const bch2_dev_read_refs[] = {
- BCH_DEV_READ_REFS()
- NULL
-};
-
-const char * const bch2_dev_write_refs[] = {
- BCH_DEV_WRITE_REFS()
- NULL
-};
-#undef x
-
-static void __bch2_print_str(struct bch_fs *c, const char *prefix,
- const char *str)
-{
-#ifdef __KERNEL__
- struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
-
- if (unlikely(stdio)) {
- bch2_stdio_redirect_printf(stdio, true, "%s", str);
- return;
- }
-#endif
- bch2_print_string_as_lines(KERN_ERR, str);
-}
-
-void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
-{
- __bch2_print_str(c, prefix, str);
-}
-
-__printf(2, 0)
-static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
-{
-#ifdef __KERNEL__
- if (unlikely(stdio)) {
- if (fmt[0] == KERN_SOH[0])
- fmt += 2;
-
- bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
- return;
- }
-#endif
- vprintk(fmt, args);
-}
-
-void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
-{
- struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
-
- va_list args;
- va_start(args, fmt);
- bch2_print_maybe_redirect(stdio, fmt, args);
- va_end(args);
-}
-
-void __bch2_print(struct bch_fs *c, const char *fmt, ...)
-{
- struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
-
- va_list args;
- va_start(args, fmt);
- bch2_print_maybe_redirect(stdio, fmt, args);
- va_end(args);
-}
-
-#define KTYPE(type) \
-static const struct attribute_group type ## _group = { \
- .attrs = type ## _files \
-}; \
- \
-static const struct attribute_group *type ## _groups[] = { \
- &type ## _group, \
- NULL \
-}; \
- \
-static const struct kobj_type type ## _ktype = { \
- .release = type ## _release, \
- .sysfs_ops = &type ## _sysfs_ops, \
- .default_groups = type ## _groups \
-}
-
-static void bch2_fs_release(struct kobject *);
-static void bch2_dev_release(struct kobject *);
-static void bch2_fs_counters_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_internal_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_opts_dir_release(struct kobject *k)
-{
-}
-
-static void bch2_fs_time_stats_release(struct kobject *k)
-{
-}
-
-KTYPE(bch2_fs);
-KTYPE(bch2_fs_counters);
-KTYPE(bch2_fs_internal);
-KTYPE(bch2_fs_opts_dir);
-KTYPE(bch2_fs_time_stats);
-KTYPE(bch2_dev);
-
-static struct kset *bcachefs_kset;
-static LIST_HEAD(bch_fs_list);
-static DEFINE_MUTEX(bch_fs_list_lock);
-
-DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
-
-static void bch2_dev_unlink(struct bch_dev *);
-static void bch2_dev_free(struct bch_dev *);
-static int bch2_dev_alloc(struct bch_fs *, unsigned);
-static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
-static void bch2_dev_io_ref_stop(struct bch_dev *, int);
-static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
-
-struct bch_fs *bch2_dev_to_fs(dev_t dev)
-{
- guard(mutex)(&bch_fs_list_lock);
- guard(rcu)();
-
- struct bch_fs *c;
- list_for_each_entry(c, &bch_fs_list, list)
- for_each_member_device_rcu(c, ca, NULL)
- if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
- closure_get(&c->cl);
- return c;
- }
- return NULL;
-}
-
-static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
-{
- struct bch_fs *c;
-
- lockdep_assert_held(&bch_fs_list_lock);
-
- list_for_each_entry(c, &bch_fs_list, list)
- if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
- return c;
-
- return NULL;
-}
-
-struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
-{
- struct bch_fs *c;
-
- mutex_lock(&bch_fs_list_lock);
- c = __bch2_uuid_to_fs(uuid);
- if (c)
- closure_get(&c->cl);
- mutex_unlock(&bch_fs_list_lock);
-
- return c;
-}
-
-/* Filesystem RO/RW: */
-
-/*
- * For startup/shutdown of RW stuff, the dependencies are:
- *
- * - foreground writes depend on copygc and rebalance (to free up space)
- *
- * - copygc and rebalance depend on mark and sweep gc (they actually probably
- * don't because they either reserve ahead of time or don't block if
- * allocations fail, but allocations can require mark and sweep gc to run
- * because of generation number wraparound)
- *
- * - all of the above depends on the allocator threads
- *
- * - allocator depends on the journal (when it rewrites prios and gens)
- */
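-
-/*
- * Shutdown therefore proceeds roughly in reverse dependency order, as
- * __bch2_fs_read_only() below does: stop rebalance and copygc, flush btree
- * updates and journal pins until two consecutive passes find no new work,
- * stop the journal, then remove each device from the allocator.
- */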
-
-static void __bch2_fs_read_only(struct bch_fs *c)
-{
- unsigned clean_passes = 0;
- u64 seq = 0;
-
- bch2_fs_ec_stop(c);
- bch2_open_buckets_stop(c, NULL, true);
- bch2_rebalance_stop(c);
- bch2_copygc_stop(c);
- bch2_fs_ec_flush(c);
-
- bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
- journal_cur_seq(&c->journal));
-
- do {
- clean_passes++;
-
- if (bch2_btree_interior_updates_flush(c) ||
- bch2_btree_write_buffer_flush_going_ro(c) ||
- bch2_journal_flush_all_pins(&c->journal) ||
- bch2_btree_flush_all_writes(c) ||
- seq != atomic64_read(&c->journal.seq)) {
- seq = atomic64_read(&c->journal.seq);
- clean_passes = 0;
- }
- } while (clean_passes < 2);
-
- bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
- journal_cur_seq(&c->journal));
-
- if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
- !test_bit(BCH_FS_emergency_ro, &c->flags))
- set_bit(BCH_FS_clean_shutdown, &c->flags);
-
- bch2_fs_journal_stop(&c->journal);
-
- bch_info(c, "%sclean shutdown complete, journal seq %llu",
- test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
- c->journal.seq_ondisk);
-
-	/*
-	 * After stopping the journal: stop device writes and remove each
-	 * device from the allocator:
-	 */
- for_each_member_device(c, ca) {
- bch2_dev_io_ref_stop(ca, WRITE);
- bch2_dev_allocator_remove(c, ca);
- }
-}
-
-static void bch2_writes_disabled(struct enumerated_ref *writes)
-{
- struct bch_fs *c = container_of(writes, struct bch_fs, writes);
-
- set_bit(BCH_FS_write_disable_complete, &c->flags);
- wake_up(&bch2_read_only_wait);
-}
-
-void bch2_fs_read_only(struct bch_fs *c)
-{
- if (!test_bit(BCH_FS_rw, &c->flags)) {
- bch2_journal_reclaim_stop(&c->journal);
- return;
- }
-
- BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));
-
- bch_verbose(c, "going read-only");
-
- /*
- * Block new foreground-end write operations from starting - any new
- * writes will return -EROFS:
- */
- set_bit(BCH_FS_going_ro, &c->flags);
- enumerated_ref_stop_async(&c->writes);
-
- /*
- * If we're not doing an emergency shutdown, we want to wait on
- * outstanding writes to complete so they don't see spurious errors due
- * to shutting down the allocator:
- *
-	 * If we are doing an emergency shutdown, outstanding writes may
-	 * hang until we shut down the allocator, so we don't want to wait
-	 * on outstanding writes before shutting everything down - but
-	 * we do need to wait on them before returning and signalling
-	 * that going RO is complete:
- */
- wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_write_disable_complete, &c->flags) ||
- test_bit(BCH_FS_emergency_ro, &c->flags));
-
- bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
- if (writes_disabled)
- bch_verbose(c, "finished waiting for writes to stop");
-
- __bch2_fs_read_only(c);
-
- wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_write_disable_complete, &c->flags));
-
- if (!writes_disabled)
- bch_verbose(c, "finished waiting for writes to stop");
-
- clear_bit(BCH_FS_write_disable_complete, &c->flags);
- clear_bit(BCH_FS_going_ro, &c->flags);
- clear_bit(BCH_FS_rw, &c->flags);
-
- if (!bch2_journal_error(&c->journal) &&
- !test_bit(BCH_FS_error, &c->flags) &&
- !test_bit(BCH_FS_emergency_ro, &c->flags) &&
- test_bit(BCH_FS_started, &c->flags) &&
- test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) {
- BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
- BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
- BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
- BUG_ON(c->btree_write_buffer.inc.keys.nr);
- BUG_ON(c->btree_write_buffer.flushing.keys.nr);
- bch2_verify_accounting_clean(c);
-
- bch_verbose(c, "marking filesystem clean");
- bch2_fs_mark_clean(c);
- } else {
- /* Make sure error counts/counters are persisted */
- mutex_lock(&c->sb_lock);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- bch_verbose(c, "done going read-only, filesystem not clean");
- }
-}
-
-static void bch2_fs_read_only_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, read_only_work);
-
- down_write(&c->state_lock);
- bch2_fs_read_only(c);
- up_write(&c->state_lock);
-}
-
-static void bch2_fs_read_only_async(struct bch_fs *c)
-{
- queue_work(system_long_wq, &c->read_only_work);
-}
-
-bool bch2_fs_emergency_read_only(struct bch_fs *c)
-{
- bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
- bch2_journal_halt(&c->journal);
- bch2_fs_read_only_async(c);
-
- wake_up(&bch2_read_only_wait);
- return ret;
-}
-
-static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out,
- bool locked)
-{
- bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
- if (!locked)
- bch2_journal_halt(&c->journal);
- else
- bch2_journal_halt_locked(&c->journal);
- bch2_fs_read_only_async(c);
- wake_up(&bch2_read_only_wait);
-
- if (ret)
- prt_printf(out, "emergency read only at seq %llu\n",
- journal_cur_seq(&c->journal));
-
- return ret;
-}
-
-bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out)
-{
- return __bch2_fs_emergency_read_only2(c, out, false);
-}
-
-bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
-{
- bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
-
- bch2_journal_halt_locked(&c->journal);
- bch2_fs_read_only_async(c);
-
- wake_up(&bch2_read_only_wait);
- return ret;
-}
-
-static int __bch2_fs_read_write(struct bch_fs *c, bool early)
-{
- int ret;
-
- BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
-
- if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
- return bch_err_throw(c, erofs_no_alloc_info);
-
- if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
- bch_err(c, "cannot go rw, unfixed btree errors");
- return bch_err_throw(c, erofs_unfixed_errors);
- }
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
- bch_err(c, "cannot go rw, filesystem is an unresized image file");
- return bch_err_throw(c, erofs_filesystem_full);
- }
-
- if (test_bit(BCH_FS_rw, &c->flags))
- return 0;
-
- bch_info(c, "going read-write");
-
- ret = bch2_fs_init_rw(c);
- if (ret)
- goto err;
-
- ret = bch2_sb_members_v2_init(c);
- if (ret)
- goto err;
-
- clear_bit(BCH_FS_clean_shutdown, &c->flags);
-
- scoped_guard(rcu)
- for_each_online_member_rcu(c, ca)
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- bch2_dev_allocator_add(c, ca);
- enumerated_ref_start(&ca->io_ref[WRITE]);
- }
-
- bch2_recalc_capacity(c);
-
- /*
- * First journal write must be a flush write: after a clean shutdown we
- * don't read the journal, so the first journal write may end up
- * overwriting whatever was there previously, and there must always be
- * at least one non-flush write in the journal or recovery will fail:
- */
- spin_lock(&c->journal.lock);
- set_bit(JOURNAL_need_flush_write, &c->journal.flags);
- set_bit(JOURNAL_running, &c->journal.flags);
- bch2_journal_space_available(&c->journal);
- spin_unlock(&c->journal.lock);
-
- ret = bch2_fs_mark_dirty(c);
- if (ret)
- goto err;
-
- ret = bch2_journal_reclaim_start(&c->journal);
- if (ret)
- goto err;
-
- set_bit(BCH_FS_rw, &c->flags);
- set_bit(BCH_FS_was_rw, &c->flags);
-
- enumerated_ref_start(&c->writes);
-
- ret = bch2_copygc_start(c);
- if (ret) {
- bch_err_msg(c, ret, "error starting copygc thread");
- goto err;
- }
-
- ret = bch2_rebalance_start(c);
- if (ret) {
- bch_err_msg(c, ret, "error starting rebalance thread");
- goto err;
- }
-
- bch2_do_discards(c);
- bch2_do_invalidates(c);
- bch2_do_stripe_deletes(c);
- bch2_do_pending_node_rewrites(c);
- return 0;
-err:
- if (test_bit(BCH_FS_rw, &c->flags))
- bch2_fs_read_only(c);
- else
- __bch2_fs_read_only(c);
- return ret;
-}
-
-int bch2_fs_read_write(struct bch_fs *c)
-{
- if (c->opts.recovery_pass_last &&
- c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
- return bch_err_throw(c, erofs_norecovery);
-
- if (c->opts.nochanges)
- return bch_err_throw(c, erofs_nochanges);
-
- if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
- return bch_err_throw(c, erofs_no_alloc_info);
-
- return __bch2_fs_read_write(c, false);
-}
-
-int bch2_fs_read_write_early(struct bch_fs *c)
-{
- down_write(&c->state_lock);
- int ret = __bch2_fs_read_write(c, true);
- up_write(&c->state_lock);
-
- return ret;
-}
-
-/* Filesystem startup/shutdown: */
-
-static void __bch2_fs_free(struct bch_fs *c)
-{
- for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
- bch2_time_stats_exit(&c->times[i]);
-
-#ifdef CONFIG_UNICODE
- utf8_unload(c->cf_encoding);
-#endif
-
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
- bch2_free_pending_node_rewrites(c);
- bch2_free_fsck_errs(c);
- bch2_fs_vfs_exit(c);
- bch2_fs_snapshots_exit(c);
- bch2_fs_sb_errors_exit(c);
- bch2_fs_replicas_exit(c);
- bch2_fs_rebalance_exit(c);
- bch2_fs_quota_exit(c);
- bch2_fs_nocow_locking_exit(c);
- bch2_fs_journal_exit(&c->journal);
- bch2_fs_fs_io_direct_exit(c);
- bch2_fs_fs_io_buffered_exit(c);
- bch2_fs_fsio_exit(c);
- bch2_fs_io_write_exit(c);
- bch2_fs_io_read_exit(c);
- bch2_fs_encryption_exit(c);
- bch2_fs_ec_exit(c);
- bch2_fs_counters_exit(c);
- bch2_fs_compress_exit(c);
- bch2_io_clock_exit(&c->io_clock[WRITE]);
- bch2_io_clock_exit(&c->io_clock[READ]);
- bch2_fs_buckets_waiting_for_journal_exit(c);
- bch2_fs_btree_write_buffer_exit(c);
- bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
- bch2_fs_btree_iter_exit(c);
- bch2_fs_btree_interior_update_exit(c);
- bch2_fs_btree_cache_exit(c);
- bch2_fs_accounting_exit(c);
- bch2_fs_async_obj_exit(c);
- bch2_journal_keys_put_initial(c);
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
-
- BUG_ON(atomic_read(&c->journal_keys.ref));
- percpu_free_rwsem(&c->mark_lock);
- if (c->online_reserved) {
- u64 v = percpu_u64_get(c->online_reserved);
- WARN(v, "online_reserved not 0 at shutdown: %lli", v);
- free_percpu(c->online_reserved);
- }
-
- darray_exit(&c->incompat_versions_requested);
- darray_exit(&c->btree_roots_extra);
- free_percpu(c->pcpu);
- free_percpu(c->usage);
- mempool_exit(&c->large_bkey_pool);
- mempool_exit(&c->btree_bounce_pool);
- bioset_exit(&c->btree_bio);
- mempool_exit(&c->fill_iter);
- enumerated_ref_exit(&c->writes);
- kfree(rcu_dereference_protected(c->disk_groups, 1));
- kfree(c->journal_seq_blacklist_table);
-
- if (c->write_ref_wq)
- destroy_workqueue(c->write_ref_wq);
- if (c->btree_write_submit_wq)
- destroy_workqueue(c->btree_write_submit_wq);
- if (c->btree_read_complete_wq)
- destroy_workqueue(c->btree_read_complete_wq);
- if (c->copygc_wq)
- destroy_workqueue(c->copygc_wq);
- if (c->btree_write_complete_wq)
- destroy_workqueue(c->btree_write_complete_wq);
- if (c->btree_update_wq)
- destroy_workqueue(c->btree_update_wq);
-
- bch2_free_super(&c->disk_sb);
- kvfree(c);
- module_put(THIS_MODULE);
-}
-
-static void bch2_fs_release(struct kobject *kobj)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- __bch2_fs_free(c);
-}
-
-void __bch2_fs_stop(struct bch_fs *c)
-{
- bch_verbose(c, "shutting down");
-
- set_bit(BCH_FS_stopping, &c->flags);
-
- down_write(&c->state_lock);
- bch2_fs_read_only(c);
- up_write(&c->state_lock);
-
- for (unsigned i = 0; i < c->sb.nr_devices; i++) {
- struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
- if (ca)
- bch2_dev_io_ref_stop(ca, READ);
- }
-
- for_each_member_device(c, ca)
- bch2_dev_unlink(ca);
-
- if (c->kobj.state_in_sysfs)
- kobject_del(&c->kobj);
-
- bch2_fs_debug_exit(c);
- bch2_fs_chardev_exit(c);
-
- bch2_ro_ref_put(c);
- wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));
-
- kobject_put(&c->counters_kobj);
- kobject_put(&c->time_stats);
- kobject_put(&c->opts_dir);
- kobject_put(&c->internal);
-
- /* btree prefetch might have kicked off reads in the background: */
- bch2_btree_flush_all_reads(c);
-
- for_each_member_device(c, ca)
- cancel_work_sync(&ca->io_error_work);
-
- cancel_work_sync(&c->read_only_work);
-}
-
-void bch2_fs_free(struct bch_fs *c)
-{
- mutex_lock(&bch_fs_list_lock);
- list_del(&c->list);
- mutex_unlock(&bch_fs_list_lock);
-
- closure_sync(&c->cl);
- closure_debug_destroy(&c->cl);
-
- for (unsigned i = 0; i < c->sb.nr_devices; i++) {
- struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
-
- if (ca) {
- EBUG_ON(atomic_long_read(&ca->ref) != 1);
- bch2_dev_io_ref_stop(ca, READ);
- bch2_free_super(&ca->disk_sb);
- bch2_dev_free(ca);
- }
- }
-
- bch_verbose(c, "shutdown complete");
-
- kobject_put(&c->kobj);
-}
-
-void bch2_fs_stop(struct bch_fs *c)
-{
- __bch2_fs_stop(c);
- bch2_fs_free(c);
-}
-
-static int bch2_fs_online(struct bch_fs *c)
-{
- int ret = 0;
-
- lockdep_assert_held(&bch_fs_list_lock);
-
- if (c->sb.multi_device &&
- __bch2_uuid_to_fs(c->sb.uuid)) {
- bch_err(c, "filesystem UUID already open");
- return bch_err_throw(c, filesystem_uuid_already_open);
- }
-
- ret = bch2_fs_chardev_init(c);
- if (ret) {
- bch_err(c, "error creating character device");
- return ret;
- }
-
- bch2_fs_debug_init(c);
-
- ret = (c->sb.multi_device
- ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b)
- : kobject_add(&c->kobj, NULL, "%s", c->name)) ?:
- kobject_add(&c->internal, &c->kobj, "internal") ?:
- kobject_add(&c->opts_dir, &c->kobj, "options") ?:
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
- kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
-#endif
- kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
- bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
- if (ret) {
- bch_err(c, "error creating sysfs objects");
- return ret;
- }
-
- down_write(&c->state_lock);
-
- for_each_member_device(c, ca) {
- ret = bch2_dev_sysfs_online(c, ca);
- if (ret) {
- bch_err(c, "error creating sysfs objects");
- bch2_dev_put(ca);
- goto err;
- }
- }
-
- BUG_ON(!list_empty(&c->list));
- list_add(&c->list, &bch_fs_list);
-err:
- up_write(&c->state_lock);
- return ret;
-}
-
-int bch2_fs_init_rw(struct bch_fs *c)
-{
- if (test_bit(BCH_FS_rw_init_done, &c->flags))
- return 0;
-
- if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
- !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
- !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
-	    !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
- !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
- WQ_FREEZABLE, 0)))
- return bch_err_throw(c, ENOMEM_fs_other_alloc);
-
- int ret = bch2_fs_btree_interior_update_init(c) ?:
- bch2_fs_btree_write_buffer_init(c) ?:
- bch2_fs_fs_io_buffered_init(c) ?:
- bch2_fs_io_write_init(c) ?:
- bch2_fs_journal_init(&c->journal);
- if (ret)
- return ret;
-
- set_bit(BCH_FS_rw_init_done, &c->flags);
- return 0;
-}
-
-static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
- bch_sb_handles *sbs)
-{
- struct bch_fs *c;
- struct printbuf name = PRINTBUF;
- unsigned i, iter_size;
- int ret = 0;
-
- c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
- if (!c) {
- c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
- goto out;
- }
-
- c->stdio = (void *)(unsigned long) opts->stdio;
-
- __module_get(THIS_MODULE);
-
- closure_init(&c->cl, NULL);
-
- c->kobj.kset = bcachefs_kset;
- kobject_init(&c->kobj, &bch2_fs_ktype);
- kobject_init(&c->internal, &bch2_fs_internal_ktype);
- kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
- kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
- kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
-
- c->minor = -1;
- c->disk_sb.fs_sb = true;
-
- init_rwsem(&c->state_lock);
- mutex_init(&c->sb_lock);
- mutex_init(&c->replicas_gc_lock);
- mutex_init(&c->btree_root_lock);
- INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
-
- refcount_set(&c->ro_ref, 1);
- init_waitqueue_head(&c->ro_ref_wait);
-
- for (i = 0; i < BCH_TIME_STAT_NR; i++)
- bch2_time_stats_init(&c->times[i]);
-
- bch2_fs_allocator_background_init(c);
- bch2_fs_allocator_foreground_init(c);
- bch2_fs_btree_cache_init_early(&c->btree_cache);
- bch2_fs_btree_gc_init_early(c);
- bch2_fs_btree_interior_update_init_early(c);
- bch2_fs_btree_iter_init_early(c);
- bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
- bch2_fs_btree_write_buffer_init_early(c);
- bch2_fs_copygc_init(c);
- bch2_fs_ec_init_early(c);
- bch2_fs_journal_init_early(&c->journal);
- bch2_fs_journal_keys_init(c);
- bch2_fs_move_init(c);
- bch2_fs_nocow_locking_init_early(c);
- bch2_fs_quota_init(c);
- bch2_fs_recovery_passes_init(c);
- bch2_fs_sb_errors_init_early(c);
- bch2_fs_snapshots_init_early(c);
- bch2_fs_subvolumes_init_early(c);
-
- INIT_LIST_HEAD(&c->list);
-
- mutex_init(&c->bio_bounce_pages_lock);
- mutex_init(&c->snapshot_table_lock);
- init_rwsem(&c->snapshot_create_lock);
-
- spin_lock_init(&c->btree_write_error_lock);
-
- INIT_LIST_HEAD(&c->journal_iters);
-
- INIT_LIST_HEAD(&c->fsck_error_msgs);
- mutex_init(&c->fsck_error_msgs_lock);
-
- seqcount_init(&c->usage_lock);
-
- sema_init(&c->io_in_flight, 128);
-
- INIT_LIST_HEAD(&c->vfs_inodes_list);
- mutex_init(&c->vfs_inodes_lock);
-
- c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
- c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
- c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
-
- mutex_init(&c->sectors_available_lock);
-
- ret = percpu_init_rwsem(&c->mark_lock);
- if (ret)
- goto err;
-
- mutex_lock(&c->sb_lock);
- ret = bch2_sb_to_fs(c, sb);
- mutex_unlock(&c->sb_lock);
-
- if (ret)
- goto err;
-
- /* Compat: */
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
- !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
- SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
-
- if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
- !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
- SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);
-
- c->opts = bch2_opts_default;
- ret = bch2_opts_from_sb(&c->opts, sb);
- if (ret)
- goto err;
-
- bch2_opts_apply(&c->opts, *opts);
-
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- c->opts.block_size > PAGE_SIZE) {
- bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE");
- ret = -EINVAL;
- goto err;
- }
-
- c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
- if (c->opts.inodes_use_key_cache)
- c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
- c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
-
- c->block_bits = ilog2(block_sectors(c));
- c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
-
- if (bch2_fs_init_fault("fs_alloc")) {
- bch_err(c, "fs_alloc fault injected");
- ret = -EFAULT;
- goto err;
- }
-
- if (c->sb.multi_device)
- pr_uuid(&name, c->sb.user_uuid.b);
- else
- prt_bdevname(&name, sbs->data[0].bdev);
-
- ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
- if (ret)
- goto err;
-
- strscpy(c->name, name.buf, sizeof(c->name));
- printbuf_exit(&name);
-
- iter_size = sizeof(struct sort_iter) +
- (btree_blocks(c) + 1) * 2 *
- sizeof(struct sort_iter_set);
-
- if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
- enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR,
- bch2_writes_disabled) ||
- mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
- bioset_init(&c->btree_bio, 1,
- max(offsetof(struct btree_read_bio, bio),
- offsetof(struct btree_write_bio, wbio.bio)),
- BIOSET_NEED_BVECS) ||
- !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
- !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
- !(c->online_reserved = alloc_percpu(u64)) ||
- mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
- c->opts.btree_node_size) ||
- mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
- ret = bch_err_throw(c, ENOMEM_fs_other_alloc);
- goto err;
- }
-
- ret =
- bch2_fs_async_obj_init(c) ?:
- bch2_fs_btree_cache_init(c) ?:
- bch2_fs_btree_iter_init(c) ?:
- bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
- bch2_fs_buckets_waiting_for_journal_init(c) ?:
- bch2_io_clock_init(&c->io_clock[READ]) ?:
- bch2_io_clock_init(&c->io_clock[WRITE]) ?:
- bch2_fs_compress_init(c) ?:
- bch2_fs_counters_init(c) ?:
- bch2_fs_ec_init(c) ?:
- bch2_fs_encryption_init(c) ?:
- bch2_fs_fsio_init(c) ?:
- bch2_fs_fs_io_direct_init(c) ?:
- bch2_fs_io_read_init(c) ?:
- bch2_fs_rebalance_init(c) ?:
- bch2_fs_sb_errors_init(c) ?:
- bch2_fs_vfs_init(c);
- if (ret)
- goto err;
-
- if (go_rw_in_recovery(c)) {
- /*
- * start workqueues/kworkers early - kthread creation checks for
- * pending signals, which is _very_ annoying
- */
- ret = bch2_fs_init_rw(c);
- if (ret)
- goto err;
- }
-
-#ifdef CONFIG_UNICODE
- if (bch2_fs_casefold_enabled(c)) {
- /* Default encoding until we can potentially have more as an option. */
- c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
- if (IS_ERR(c->cf_encoding)) {
- printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
- ret = -EINVAL;
- goto err;
- }
- }
-#else
- if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
- printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
- ret = -EINVAL;
- goto err;
- }
-#endif
-
- for (i = 0; i < c->sb.nr_devices; i++) {
- if (!bch2_member_exists(c->disk_sb.sb, i))
- continue;
- ret = bch2_dev_alloc(c, i);
- if (ret)
- goto err;
- }
-
- bch2_journal_entry_res_resize(&c->journal,
- &c->btree_root_journal_res,
- BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
- bch2_journal_entry_res_resize(&c->journal,
- &c->clock_journal_res,
- (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
-
- mutex_lock(&bch_fs_list_lock);
- ret = bch2_fs_online(c);
- mutex_unlock(&bch_fs_list_lock);
-
- if (ret)
- goto err;
-out:
- return c;
-err:
- bch2_fs_free(c);
- c = ERR_PTR(ret);
- goto out;
-}
-
-noinline_for_stack
-static void print_mount_opts(struct bch_fs *c)
-{
- enum bch_opt_id i;
- CLASS(printbuf, p)();
- bch2_log_msg_start(c, &p);
-
- prt_str(&p, "starting version ");
- bch2_version_to_text(&p, c->sb.version);
-
- bool first = true;
- for (i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
- u64 v = bch2_opt_get_by_id(&c->opts, i);
-
- if (!(opt->flags & OPT_MOUNT))
- continue;
-
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
-
- prt_str(&p, first ? " opts=" : ",");
- first = false;
- bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
- }
-
- if (c->sb.version_incompat_allowed != c->sb.version) {
- prt_printf(&p, "\nallowing incompatible features above ");
- bch2_version_to_text(&p, c->sb.version_incompat_allowed);
- }
-
- if (c->opts.verbose) {
- prt_printf(&p, "\nfeatures: ");
- prt_bitflags(&p, bch2_sb_features, c->sb.features);
- }
-
- if (c->sb.multi_device) {
- prt_printf(&p, "\nwith devices");
- for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) {
- prt_char(&p, ' ');
- prt_str(&p, ca->name);
- }
- }
-
- bch2_print_str(c, KERN_INFO, p.buf);
-}
-
-static bool bch2_fs_may_start(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned flags = 0;
-
- switch (c->opts.degraded) {
- case BCH_DEGRADED_very:
- flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
- break;
- case BCH_DEGRADED_yes:
- flags |= BCH_FORCE_IF_DEGRADED;
- break;
- default:
- mutex_lock(&c->sb_lock);
- for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- if (!bch2_member_exists(c->disk_sb.sb, i))
- continue;
-
- ca = bch2_dev_locked(c, i);
-
- if (!bch2_dev_is_online(ca) &&
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
- ca->mi.state == BCH_MEMBER_STATE_ro)) {
- mutex_unlock(&c->sb_lock);
- return false;
- }
- }
- mutex_unlock(&c->sb_lock);
- break;
- }
-
- return bch2_have_enough_devs(c, c->online_devs, flags, true);
-}
-
-int bch2_fs_start(struct bch_fs *c)
-{
- time64_t now = ktime_get_real_seconds();
- int ret = 0;
-
- print_mount_opts(c);
-
- if (c->cf_encoding)
- bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
-
- if (!bch2_fs_may_start(c))
- return bch_err_throw(c, insufficient_devices_to_start);
-
- down_write(&c->state_lock);
- mutex_lock(&c->sb_lock);
-
- BUG_ON(test_bit(BCH_FS_started, &c->flags));
-
- if (!bch2_sb_field_get_minsize(&c->disk_sb, ext,
- sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
- mutex_unlock(&c->sb_lock);
- up_write(&c->state_lock);
- ret = bch_err_throw(c, ENOSPC_sb);
- goto err;
- }
-
- ret = bch2_sb_members_v2_init(c);
- if (ret) {
- mutex_unlock(&c->sb_lock);
- up_write(&c->state_lock);
- goto err;
- }
-
- scoped_guard(rcu)
- for_each_online_member_rcu(c, ca)
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
- cpu_to_le64(now);
-
- /*
-	 * Don't write the superblock yet: recovery might have to downgrade
- */
- mutex_unlock(&c->sb_lock);
-
- scoped_guard(rcu)
- for_each_online_member_rcu(c, ca)
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- bch2_dev_allocator_add(c, ca);
- bch2_recalc_capacity(c);
- up_write(&c->state_lock);
-
- c->recovery_task = current;
- ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
- ? bch2_fs_recovery(c)
- : bch2_fs_initialize(c);
- c->recovery_task = NULL;
-
- if (ret)
- goto err;
-
- ret = bch2_opts_hooks_pre_set(c);
- if (ret)
- goto err;
-
- if (bch2_fs_init_fault("fs_start")) {
- ret = bch_err_throw(c, injected_fs_start);
- goto err;
- }
-
- set_bit(BCH_FS_started, &c->flags);
- wake_up(&c->ro_ref_wait);
-
- down_write(&c->state_lock);
- if (c->opts.read_only)
- bch2_fs_read_only(c);
- else if (!test_bit(BCH_FS_rw, &c->flags))
- ret = bch2_fs_read_write(c);
- up_write(&c->state_lock);
-
-err:
- if (ret)
- bch_err_msg(c, ret, "starting filesystem");
- else
- bch_verbose(c, "done starting filesystem");
- return ret;
-}
-
-static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
-{
- struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
-
- if (le16_to_cpu(sb->block_size) != block_sectors(c))
- return bch_err_throw(c, mismatched_block_size);
-
- if (le16_to_cpu(m.bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
- return bch_err_throw(c, bucket_size_too_small);
-
- return 0;
-}
-
-static int bch2_dev_in_fs(struct bch_sb_handle *fs,
- struct bch_sb_handle *sb,
- struct bch_opts *opts)
-{
- if (fs == sb)
- return 0;
-
- if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
- return -BCH_ERR_device_not_a_member_of_filesystem;
-
- if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
- return -BCH_ERR_device_has_been_removed;
-
- if (fs->sb->block_size != sb->sb->block_size)
- return -BCH_ERR_mismatched_block_size;
-
- if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
- le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
- return 0;
-
- if (fs->sb->seq == sb->sb->seq &&
- fs->sb->write_time != sb->sb->write_time) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Split brain detected between ");
- prt_bdevname(&buf, sb->bdev);
- prt_str(&buf, " and ");
- prt_bdevname(&buf, fs->bdev);
- prt_char(&buf, ':');
- prt_newline(&buf);
- prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
- prt_newline(&buf);
-
- prt_bdevname(&buf, fs->bdev);
- prt_char(&buf, ' ');
- bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
- prt_newline(&buf);
-
- prt_bdevname(&buf, sb->bdev);
- prt_char(&buf, ' ');
- bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
- prt_newline(&buf);
-
- if (!opts->no_splitbrain_check)
- prt_printf(&buf, "Not using older sb");
-
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
-
- if (!opts->no_splitbrain_check)
- return -BCH_ERR_device_splitbrain;
- }
-
- struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
- u64 seq_from_fs = le64_to_cpu(m.seq);
- u64 seq_from_member = le64_to_cpu(sb->sb->seq);
-
- if (seq_from_fs && seq_from_fs < seq_from_member) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "Split brain detected between ");
- prt_bdevname(&buf, sb->bdev);
- prt_str(&buf, " and ");
- prt_bdevname(&buf, fs->bdev);
- prt_char(&buf, ':');
- prt_newline(&buf);
-
- prt_bdevname(&buf, fs->bdev);
- prt_str(&buf, " believes seq of ");
- prt_bdevname(&buf, sb->bdev);
- prt_printf(&buf, " to be %llu, but ", seq_from_fs);
- prt_bdevname(&buf, sb->bdev);
- prt_printf(&buf, " has %llu\n", seq_from_member);
-
- if (!opts->no_splitbrain_check) {
- prt_str(&buf, "Not using ");
- prt_bdevname(&buf, sb->bdev);
- }
-
- pr_err("%s", buf.buf);
- printbuf_exit(&buf);
-
- if (!opts->no_splitbrain_check)
- return -BCH_ERR_device_splitbrain;
- }
-
- return 0;
-}
-
-/* Device startup/shutdown: */
-
-static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
-{
- if (rw == READ)
- clear_bit(ca->dev_idx, ca->fs->online_devs.d);
-
- if (!enumerated_ref_is_zero(&ca->io_ref[rw]))
- enumerated_ref_stop(&ca->io_ref[rw],
- rw == READ
- ? bch2_dev_read_refs
- : bch2_dev_write_refs);
-}
-
-static void bch2_dev_release(struct kobject *kobj)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
-
- kfree(ca);
-}
-
-static void bch2_dev_free(struct bch_dev *ca)
-{
- WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
- WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
-
- cancel_work_sync(&ca->io_error_work);
-
- bch2_dev_unlink(ca);
-
- if (ca->kobj.state_in_sysfs)
- kobject_del(&ca->kobj);
-
- bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
- bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
-
- bch2_free_super(&ca->disk_sb);
- bch2_dev_allocator_background_exit(ca);
- bch2_dev_journal_exit(ca);
-
- free_percpu(ca->io_done);
- bch2_dev_buckets_free(ca);
- kfree(ca->sb_read_scratch);
-
- bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
- bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
-
- enumerated_ref_exit(&ca->io_ref[WRITE]);
- enumerated_ref_exit(&ca->io_ref[READ]);
-#ifndef CONFIG_BCACHEFS_DEBUG
- percpu_ref_exit(&ca->ref);
-#endif
- kobject_put(&ca->kobj);
-}
-
-static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
-{
-
- lockdep_assert_held(&c->state_lock);
-
- if (enumerated_ref_is_zero(&ca->io_ref[READ]))
- return;
-
- __bch2_dev_read_only(c, ca);
-
- bch2_dev_io_ref_stop(ca, READ);
-
- bch2_dev_unlink(ca);
-
- bch2_free_super(&ca->disk_sb);
- bch2_dev_journal_exit(ca);
-}
-
-#ifndef CONFIG_BCACHEFS_DEBUG
-static void bch2_dev_ref_complete(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
-
- complete(&ca->ref_completion);
-}
-#endif
-
-static void bch2_dev_unlink(struct bch_dev *ca)
-{
- struct kobject *b;
-
- /*
- * This is racy w.r.t. the underlying block device being hot-removed,
- * which removes it from sysfs.
- *
- * It'd be lovely if we had a way to handle this race, but the sysfs
- * code doesn't appear to provide a good method and block/holder.c is
- * susceptible as well:
- */
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev &&
- (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
- sysfs_remove_link(b, "bcachefs");
- sysfs_remove_link(&ca->kobj, "block");
- }
-}
-
-static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
-{
- int ret;
-
- if (!c->kobj.state_in_sysfs)
- return 0;
-
- if (!ca->kobj.state_in_sysfs) {
- ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
- bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
- if (ret)
- return ret;
- }
-
- if (ca->disk_sb.bdev) {
- struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
-
- ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
- if (ret)
- return ret;
-
- ret = sysfs_create_link(&ca->kobj, block, "block");
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
- struct bch_member *member)
-{
- struct bch_dev *ca;
- unsigned i;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca)
- return NULL;
-
- kobject_init(&ca->kobj, &bch2_dev_ktype);
- init_completion(&ca->ref_completion);
-
- INIT_WORK(&ca->io_error_work, bch2_io_error_work);
-
- bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
- bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
-
- ca->mi = bch2_mi_to_cpu(member);
-
- for (i = 0; i < ARRAY_SIZE(member->errors); i++)
- atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
-
- ca->uuid = member->uuid;
-
- ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
- ca->mi.bucket_size / btree_sectors(c));
-
-#ifndef CONFIG_BCACHEFS_DEBUG
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
- goto err;
-#else
- atomic_long_set(&ca->ref, 1);
-#endif
-
- mutex_init(&ca->bucket_backpointer_mismatch.lock);
- mutex_init(&ca->bucket_backpointer_empty.lock);
-
- bch2_dev_allocator_background_init(ca);
-
- if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) ||
- enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) ||
- !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
- bch2_dev_buckets_alloc(c, ca) ||
- !(ca->io_done = alloc_percpu(*ca->io_done)))
- goto err;
-
- return ca;
-err:
- bch2_dev_free(ca);
- return NULL;
-}
-
-static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
- unsigned dev_idx)
-{
- ca->dev_idx = dev_idx;
- __set_bit(ca->dev_idx, ca->self.d);
-
- if (!ca->name[0])
- scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
-
- ca->fs = c;
- rcu_assign_pointer(c->devs[ca->dev_idx], ca);
-
- if (bch2_dev_sysfs_online(c, ca))
- pr_warn("error creating sysfs objects");
-}
-
-static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
-{
- struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
- struct bch_dev *ca = NULL;
-
- if (bch2_fs_init_fault("dev_alloc"))
- goto err;
-
- ca = __bch2_dev_alloc(c, &member);
- if (!ca)
- goto err;
-
- ca->fs = c;
-
- bch2_dev_attach(c, ca, dev_idx);
- return 0;
-err:
- return bch_err_throw(c, ENOMEM_dev_alloc);
-}
-
-static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
-{
- unsigned ret;
-
- if (bch2_dev_is_online(ca)) {
- bch_err(ca, "already have device online in slot %u",
- sb->sb->dev_idx);
- return bch_err_throw(ca->fs, device_already_online);
- }
-
- if (get_capacity(sb->bdev->bd_disk) <
- ca->mi.bucket_size * ca->mi.nbuckets) {
- bch_err(ca, "cannot online: device too small");
- return bch_err_throw(ca->fs, device_size_too_small);
- }
-
- BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
- BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
-
- ret = bch2_dev_journal_init(ca, sb->sb);
- if (ret)
- return ret;
-
- struct printbuf name = PRINTBUF;
- prt_bdevname(&name, sb->bdev);
- strscpy(ca->name, name.buf, sizeof(ca->name));
- printbuf_exit(&name);
-
- /* Commit: */
- ca->disk_sb = *sb;
- memset(sb, 0, sizeof(*sb));
-
- /*
- * Stash pointer to the filesystem for blk_holder_ops - note that once
- * attached to a filesystem, we will always close the block device
- * before tearing down the filesystem object.
- */
- ca->disk_sb.holder->c = ca->fs;
-
- ca->dev = ca->disk_sb.bdev->bd_dev;
-
- enumerated_ref_start(&ca->io_ref[READ]);
-
- return 0;
-}
-
-static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
-{
- struct bch_dev *ca;
- int ret;
-
- lockdep_assert_held(&c->state_lock);
-
- if (le64_to_cpu(sb->sb->seq) >
- le64_to_cpu(c->disk_sb.sb->seq))
- bch2_sb_to_fs(c, sb->sb);
-
- BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
-
- ca = bch2_dev_locked(c, sb->sb->dev_idx);
-
- ret = __bch2_dev_attach_bdev(ca, sb);
- if (ret)
- return ret;
-
- set_bit(ca->dev_idx, c->online_devs.d);
-
- bch2_dev_sysfs_online(c, ca);
-
- bch2_rebalance_wakeup(c);
- return 0;
-}
-
-/* Device management: */
-
-/*
- * Note: this function is also used by the error paths - when a particular
- * device sees an error, we call it to determine whether we can just set the
- * device RO, or - if this function returns false - we'll set the whole
- * filesystem RO:
- *
- * XXX: maybe we should be more explicit about whether we're changing state
- * because we got an error or what have you?
- */
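-
-/*
- * Typical call pattern (illustrative, as used by bch2_dev_remove() and
- * bch2_dev_offline() below):
- *
- *	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags))
- *		return bch_err_throw(c, device_state_not_allowed);
- */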
-bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- struct bch_devs_mask new_online_devs;
- int nr_rw = 0, required;
-
- lockdep_assert_held(&c->state_lock);
-
- switch (new_state) {
- case BCH_MEMBER_STATE_rw:
- return true;
- case BCH_MEMBER_STATE_ro:
- if (ca->mi.state != BCH_MEMBER_STATE_rw)
- return true;
-
- /* do we have enough devices to write to? */
- for_each_member_device(c, ca2)
- if (ca2 != ca)
- nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
-
- required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
- ? c->opts.metadata_replicas
- : metadata_replicas_required(c),
- !(flags & BCH_FORCE_IF_DATA_DEGRADED)
- ? c->opts.data_replicas
- : data_replicas_required(c));
-
- return nr_rw >= required;
- case BCH_MEMBER_STATE_failed:
- case BCH_MEMBER_STATE_spare:
- if (ca->mi.state != BCH_MEMBER_STATE_rw &&
- ca->mi.state != BCH_MEMBER_STATE_ro)
- return true;
-
- /* do we have enough devices to read from? */
- new_online_devs = c->online_devs;
- __clear_bit(ca->dev_idx, new_online_devs.d);
-
- return bch2_have_enough_devs(c, new_online_devs, flags, false);
- default:
- BUG();
- }
-}
-
-static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
-{
- bch2_dev_io_ref_stop(ca, WRITE);
-
- /*
- * The allocator thread itself allocates btree nodes, so stop it first:
- */
- bch2_dev_allocator_remove(c, ca);
- bch2_recalc_capacity(c);
- bch2_dev_journal_stop(&c->journal, ca);
-}
-
-static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
-
- bch2_dev_allocator_add(c, ca);
- bch2_recalc_capacity(c);
-
- if (enumerated_ref_is_zero(&ca->io_ref[WRITE]))
- enumerated_ref_start(&ca->io_ref[WRITE]);
-
- bch2_dev_do_discards(ca);
-}
-
-int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- struct bch_member *m;
- int ret = 0;
-
- if (ca->mi.state == new_state)
- return 0;
-
- if (!bch2_dev_state_allowed(c, ca, new_state, flags))
- return bch_err_throw(c, device_state_not_allowed);
-
- if (new_state != BCH_MEMBER_STATE_rw)
- __bch2_dev_read_only(c, ca);
-
- bch_notice(ca, "%s", bch2_member_states[new_state]);
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_STATE(m, new_state);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (new_state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
-
- bch2_rebalance_wakeup(c);
-
- return ret;
-}
-
-int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- int ret;
-
- down_write(&c->state_lock);
- ret = __bch2_dev_set_state(c, ca, new_state, flags);
- up_write(&c->state_lock);
-
- return ret;
-}
-
-/* Device add/removal: */
-
-int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
- struct bch_member *m;
- unsigned dev_idx = ca->dev_idx, data;
- bool fast_device_removal = !bch2_request_incompat_feature(c,
- bcachefs_metadata_version_fast_device_removal);
- int ret;
-
- down_write(&c->state_lock);
-
- /*
- * We consume a reference to ca->ref, regardless of whether we succeed
- * or fail:
- */
- bch2_dev_put(ca);
-
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
- bch_err(ca, "Cannot remove without losing data");
- ret = bch_err_throw(c, device_state_not_allowed);
- goto err;
- }
-
- __bch2_dev_read_only(c, ca);
-
- ret = fast_device_removal
- ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
- : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?:
- bch2_dev_remove_stripes(c, ca->dev_idx, flags));
- if (ret)
- goto err;
-
- /* Check if device still has data before blowing away alloc info */
- struct bch_dev_usage usage = bch2_dev_usage_read(ca);
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- if (!data_type_is_empty(i) &&
- !data_type_is_hidden(i) &&
- usage.buckets[i]) {
- bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
- __bch2_data_types[i], usage.buckets[i]);
- ret = -EBUSY;
- goto err;
- }
-
- ret = bch2_dev_remove_alloc(c, ca);
- bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
- if (ret)
- goto err;
-
- /*
- * We need to flush the entire journal to get rid of keys that reference
- * the device being removed before removing the superblock entry
- */
- bch2_journal_flush_all_pins(&c->journal);
-
- /*
- * this is really just needed for the bch2_replicas_gc_(start|end)
- * calls, and could be cleaned up:
- */
- ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
- if (ret)
- goto err;
-
- ret = bch2_journal_flush(&c->journal);
- bch_err_msg(ca, ret, "bch2_journal_flush()");
- if (ret)
- goto err;
-
- ret = bch2_replicas_gc2(c);
- bch_err_msg(ca, ret, "bch2_replicas_gc2()");
- if (ret)
- goto err;
-
- data = bch2_dev_has_data(c, ca);
- if (data) {
- struct printbuf data_has = PRINTBUF;
-
- prt_bitflags(&data_has, __bch2_data_types, data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
- printbuf_exit(&data_has);
- ret = -EBUSY;
- goto err;
- }
-
- __bch2_dev_offline(c, ca);
-
- mutex_lock(&c->sb_lock);
- rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
- mutex_unlock(&c->sb_lock);
-
-#ifndef CONFIG_BCACHEFS_DEBUG
- percpu_ref_kill(&ca->ref);
-#else
- ca->dying = true;
- bch2_dev_put(ca);
-#endif
- wait_for_completion(&ca->ref_completion);
-
- bch2_dev_free(ca);
-
- /*
- * Free this device's slot in the bch_member array - all pointers to
- * this device must be gone:
- */
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
-
- if (fast_device_removal)
- m->uuid = BCH_SB_MEMBER_DELETED_UUID;
- else
- memset(&m->uuid, 0, sizeof(m->uuid));
-
- bch2_write_super(c);
-
- mutex_unlock(&c->sb_lock);
- up_write(&c->state_lock);
- return 0;
-err:
- if (test_bit(BCH_FS_rw, &c->flags) &&
- ca->mi.state == BCH_MEMBER_STATE_rw &&
- !enumerated_ref_is_zero(&ca->io_ref[READ]))
- __bch2_dev_read_write(c, ca);
- up_write(&c->state_lock);
- return ret;
-}
-
-/* Add new device to running filesystem: */
-int bch2_dev_add(struct bch_fs *c, const char *path)
-{
- struct bch_opts opts = bch2_opts_empty();
- struct bch_sb_handle sb = {};
- struct bch_dev *ca = NULL;
- struct printbuf errbuf = PRINTBUF;
- struct printbuf label = PRINTBUF;
- int ret = 0;
-
- ret = bch2_read_super(path, &opts, &sb);
- bch_err_msg(c, ret, "reading super");
- if (ret)
- goto err;
-
- struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
-
- if (BCH_MEMBER_GROUP(&dev_mi)) {
- bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
- if (label.allocation_failure) {
- ret = -ENOMEM;
- goto err;
- }
- }
-
- if (list_empty(&c->list)) {
- mutex_lock(&bch_fs_list_lock);
- if (__bch2_uuid_to_fs(c->sb.uuid))
- ret = bch_err_throw(c, filesystem_uuid_already_open);
- else
- list_add(&c->list, &bch_fs_list);
- mutex_unlock(&bch_fs_list_lock);
-
- if (ret) {
- bch_err(c, "filesystem UUID already open");
- goto err;
- }
- }
-
- ret = bch2_dev_may_add(sb.sb, c);
- if (ret)
- goto err;
-
- ca = __bch2_dev_alloc(c, &dev_mi);
- if (!ca) {
- ret = -ENOMEM;
- goto err;
- }
-
- ret = __bch2_dev_attach_bdev(ca, &sb);
- if (ret)
- goto err;
-
- down_write(&c->state_lock);
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);
-
- ret = bch2_sb_from_fs(c, ca);
- bch_err_msg(c, ret, "setting up new superblock");
- if (ret)
- goto err_unlock;
-
- if (dynamic_fault("bcachefs:add:no_slot"))
- goto err_unlock;
-
- ret = bch2_sb_member_alloc(c);
- if (ret < 0) {
- bch_err_msg(c, ret, "setting up new superblock");
- goto err_unlock;
- }
- unsigned dev_idx = ret;
- ret = 0;
-
- /* success: */
-
- dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
- *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
-
- ca->disk_sb.sb->dev_idx = dev_idx;
- bch2_dev_attach(c, ca, dev_idx);
-
- if (BCH_MEMBER_GROUP(&dev_mi)) {
- ret = __bch2_dev_group_set(c, ca, label.buf);
- bch_err_msg(c, ret, "creating new label");
- if (ret)
- goto err_unlock;
- }
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (test_bit(BCH_FS_started, &c->flags)) {
- ret = bch2_dev_usage_init(ca, false);
- if (ret)
- goto err_late;
-
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(ca, ret, "marking new superblock");
- if (ret)
- goto err_late;
-
- ret = bch2_fs_freespace_init(c);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
- goto err_late;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
-
- ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(c, ret, "allocating journal");
- if (ret)
- goto err_late;
- }
-
- /*
- * We just changed the superblock UUID, invalidate cache and send a
- * uevent to update /dev/disk/by-uuid
- */
- invalidate_bdev(ca->disk_sb.bdev);
-
- char uuid_str[37];
- snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid);
-
- char *envp[] = {
- "CHANGE=uuid",
- uuid_str,
- NULL,
- };
- kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);
-
- up_write(&c->state_lock);
-out:
- printbuf_exit(&label);
- printbuf_exit(&errbuf);
- bch_err_fn(c, ret);
- return ret;
-
-err_unlock:
- mutex_unlock(&c->sb_lock);
- up_write(&c->state_lock);
-err:
- if (ca)
- bch2_dev_free(ca);
- bch2_free_super(&sb);
- goto out;
-err_late:
- up_write(&c->state_lock);
- ca = NULL;
- goto err;
-}
-
-/* Hot add existing device to running filesystem: */
-int bch2_dev_online(struct bch_fs *c, const char *path)
-{
- struct bch_opts opts = bch2_opts_empty();
- struct bch_sb_handle sb = { NULL };
- struct bch_dev *ca;
- unsigned dev_idx;
- int ret;
-
- down_write(&c->state_lock);
-
- ret = bch2_read_super(path, &opts, &sb);
- if (ret) {
- up_write(&c->state_lock);
- return ret;
- }
-
- dev_idx = sb.sb->dev_idx;
-
- ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
- bch_err_msg(c, ret, "bringing %s online", path);
- if (ret)
- goto err;
-
- ret = bch2_dev_attach_bdev(c, &sb);
- if (ret)
- goto err;
-
- ca = bch2_dev_locked(c, dev_idx);
-
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
- if (ret)
- goto err;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
-
- if (!ca->mi.freespace_initialized) {
- ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
- goto err;
- }
-
- if (!ca->journal.nr) {
- ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(ca, ret, "allocating journal");
- if (ret)
- goto err;
- }
-
- mutex_lock(&c->sb_lock);
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
- cpu_to_le64(ktime_get_real_seconds());
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- up_write(&c->state_lock);
- return 0;
-err:
- up_write(&c->state_lock);
- bch2_free_super(&sb);
- return ret;
-}
-
-int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
- down_write(&c->state_lock);
-
- if (!bch2_dev_is_online(ca)) {
- bch_err(ca, "Already offline");
- up_write(&c->state_lock);
- return 0;
- }
-
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
- bch_err(ca, "Cannot offline required disk");
- up_write(&c->state_lock);
- return bch_err_throw(c, device_state_not_allowed);
- }
-
- __bch2_dev_offline(c, ca);
-
- up_write(&c->state_lock);
- return 0;
-}
-
-static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets)
-{
- struct bch_fs *c = ca->fs;
- u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };
-
- return bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
- bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
- .dev = ca->dev_idx,
- .data_type = BCH_DATA_free)) ?:
- bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
-}
-
-int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
-{
- struct bch_member *m;
- u64 old_nbuckets;
- int ret = 0;
-
- down_write(&c->state_lock);
- old_nbuckets = ca->mi.nbuckets;
-
- if (nbuckets < ca->mi.nbuckets) {
- bch_err(ca, "Cannot shrink yet");
- ret = -EINVAL;
- goto err;
- }
-
- if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
- bch_err(ca, "New device size too big (%llu greater than max %u)",
- nbuckets, BCH_MEMBER_NBUCKETS_MAX);
- ret = bch_err_throw(c, device_size_too_big);
- goto err;
- }
-
- if (bch2_dev_is_online(ca) &&
- get_capacity(ca->disk_sb.bdev->bd_disk) <
- ca->mi.bucket_size * nbuckets) {
- bch_err(ca, "New size larger than device");
- ret = bch_err_throw(c, device_size_too_small);
- goto err;
- }
-
- ret = bch2_dev_buckets_resize(c, ca, nbuckets);
- bch_err_msg(ca, ret, "resizing buckets");
- if (ret)
- goto err;
-
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- if (ret)
- goto err;
-
- mutex_lock(&c->sb_lock);
- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- m->nbuckets = cpu_to_le64(nbuckets);
-
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (ca->mi.freespace_initialized) {
- ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
- if (ret)
- goto err;
- }
-
- bch2_recalc_capacity(c);
-err:
- up_write(&c->state_lock);
- return ret;
-}
-
-int bch2_fs_resize_on_mount(struct bch_fs *c)
-{
- for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
- u64 old_nbuckets = ca->mi.nbuckets;
- u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
- ca->mi.bucket_size);
-
- if (ca->mi.resize_on_mount &&
- new_nbuckets > ca->mi.nbuckets) {
- bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size);
- int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets);
- bch_err_fn(ca, ret);
- if (ret) {
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_fs_resize_on_mount);
- up_write(&c->state_lock);
- return ret;
- }
-
- mutex_lock(&c->sb_lock);
- struct bch_member *m =
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- m->nbuckets = cpu_to_le64(new_nbuckets);
- SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false);
-
- c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image));
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (ca->mi.freespace_initialized) {
- ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets);
- if (ret) {
- enumerated_ref_put(&ca->io_ref[READ],
- BCH_DEV_READ_REF_fs_resize_on_mount);
- up_write(&c->state_lock);
- return ret;
- }
- }
- }
- }
- return 0;
-}
-
-/* return with ref on ca->ref: */
-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
-{
- if (!strncmp(name, "/dev/", strlen("/dev/")))
- name += strlen("/dev/");
-
- for_each_member_device(c, ca)
- if (!strcmp(name, ca->name))
- return ca;
- return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
-}
-
-/* blk_holder_ops: */
-
-static struct bch_fs *bdev_get_fs(struct block_device *bdev)
- __releases(&bdev->bd_holder_lock)
-{
- struct bch_sb_handle_holder *holder = bdev->bd_holder;
- struct bch_fs *c = holder->c;
-
- if (c && !bch2_ro_ref_tryget(c))
- c = NULL;
-
- mutex_unlock(&bdev->bd_holder_lock);
-
- if (c)
- wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
- return c;
-}
-
-/* returns with ref on ca->ref */
-static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
-{
- for_each_member_device(c, ca)
- if (ca->disk_sb.bdev == bdev)
- return ca;
- return NULL;
-}
-
-static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
-{
- struct bch_fs *c = bdev_get_fs(bdev);
- if (!c)
- return;
-
- struct super_block *sb = c->vfs_sb;
- if (sb) {
- /*
- * Not necessary, c->ro_ref guards against the filesystem being
- * unmounted - we only take this to avoid a warning in
- * sync_filesystem:
- */
- down_read(&sb->s_umount);
- }
-
- down_write(&c->state_lock);
- struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
- if (!ca)
- goto unlock;
-
- bool dev = bch2_dev_state_allowed(c, ca,
- BCH_MEMBER_STATE_failed,
- BCH_FORCE_IF_DEGRADED);
-
- if (!dev && sb) {
- if (!surprise)
- sync_filesystem(sb);
- shrink_dcache_sb(sb);
- evict_inodes(sb);
- }
-
- struct printbuf buf = PRINTBUF;
- __bch2_log_msg_start(ca->name, &buf);
-
- prt_printf(&buf, "offline from block layer");
-
- if (dev) {
- __bch2_dev_offline(c, ca);
- } else {
- bch2_journal_flush(&c->journal);
- bch2_fs_emergency_read_only2(c, &buf);
- }
-
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- bch2_dev_put(ca);
-unlock:
- if (sb)
- up_read(&sb->s_umount);
- up_write(&c->state_lock);
- bch2_ro_ref_put(c);
-}
-
-static void bch2_fs_bdev_sync(struct block_device *bdev)
-{
- struct bch_fs *c = bdev_get_fs(bdev);
- if (!c)
- return;
-
- struct super_block *sb = c->vfs_sb;
- if (sb) {
- /*
- * Not necessary, c->ro_ref guards against the filesystem being
- * unmounted - we only take this to avoid a warning in
- * sync_filesystem:
- */
- down_read(&sb->s_umount);
- sync_filesystem(sb);
- up_read(&sb->s_umount);
- }
-
- bch2_ro_ref_put(c);
-}
-
-const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
- .mark_dead = bch2_fs_bdev_mark_dead,
- .sync = bch2_fs_bdev_sync,
-};
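
These holder ops are the block layer's way of calling back into the filesystem when a member device is being removed (mark_dead) or needs syncing; bdev_get_fs() maps the bdev's holder back to the bch_fs and pins it with bch2_ro_ref_tryget(). A rough sketch of how the ops are attached when a member device is opened, assuming the bdev_file_open_by_path() interface (the actual open path lives in the super-io code):

	struct file *bdev_file = bdev_file_open_by_path(path,
					BLK_OPEN_READ | BLK_OPEN_WRITE,
					holder,	/* struct bch_sb_handle_holder * */
					&bch2_sb_handle_bdev_ops);
	if (IS_ERR(bdev_file))
		return PTR_ERR(bdev_file);
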
-
-/* Filesystem open: */
-
-static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
-{
- return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
- cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
-}
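
cmp_int() evaluates to -1, 0 or 1, so the ?: chain is a lexicographic comparison: the superblock with the highest seq is considered best, with write_time breaking ties. Written out longhand, the same ordering is roughly:

	static int sb_cmp_longhand(struct bch_sb *l, struct bch_sb *r)
	{
		u64 l_seq  = le64_to_cpu(l->seq),        r_seq  = le64_to_cpu(r->seq);
		u64 l_time = le64_to_cpu(l->write_time), r_time = le64_to_cpu(r->write_time);

		if (l_seq != r_seq)
			return l_seq < r_seq ? -1 : 1;
		return l_time < r_time ? -1 : l_time > r_time ? 1 : 0;
	}
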
-
-struct bch_fs *bch2_fs_open(darray_const_str *devices,
- struct bch_opts *opts)
-{
- bch_sb_handles sbs = {};
- struct bch_fs *c = NULL;
- struct bch_sb_handle *best = NULL;
- struct printbuf errbuf = PRINTBUF;
- int ret = 0;
-
- if (!try_module_get(THIS_MODULE))
- return ERR_PTR(-ENODEV);
-
- if (!devices->nr) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = darray_make_room(&sbs, devices->nr);
- if (ret)
- goto err;
-
- darray_for_each(*devices, i) {
- struct bch_sb_handle sb = { NULL };
-
- ret = bch2_read_super(*i, opts, &sb);
- if (ret)
- goto err;
-
- BUG_ON(darray_push(&sbs, sb));
- }
-
- if (opts->nochanges && !opts->read_only) {
- ret = bch_err_throw(c, erofs_nochanges);
- goto err_print;
- }
-
- darray_for_each(sbs, sb)
- if (!best || sb_cmp(sb->sb, best->sb) > 0)
- best = sb;
-
- darray_for_each_reverse(sbs, sb) {
- ret = bch2_dev_in_fs(best, sb, opts);
-
- if (ret == -BCH_ERR_device_has_been_removed ||
- ret == -BCH_ERR_device_splitbrain) {
- bch2_free_super(sb);
- darray_remove_item(&sbs, sb);
- best -= best > sb;
- ret = 0;
- continue;
- }
-
- if (ret)
- goto err_print;
- }
-
- c = bch2_fs_alloc(best->sb, opts, &sbs);
- ret = PTR_ERR_OR_ZERO(c);
- if (ret)
- goto err;
-
- down_write(&c->state_lock);
- darray_for_each(sbs, sb) {
- ret = bch2_dev_attach_bdev(c, sb);
- if (ret) {
- up_write(&c->state_lock);
- goto err;
- }
- }
- up_write(&c->state_lock);
-
- if (!c->opts.nostart) {
- ret = bch2_fs_start(c);
- if (ret)
- goto err;
- }
-out:
- darray_for_each(sbs, sb)
- bch2_free_super(sb);
- darray_exit(&sbs);
- printbuf_exit(&errbuf);
- module_put(THIS_MODULE);
- return c;
-err_print:
- pr_err("bch_fs_open err opening %s: %s",
- devices->data[0], bch2_err_str(ret));
-err:
- if (!IS_ERR_OR_NULL(c))
- bch2_fs_stop(c);
- c = ERR_PTR(ret);
- goto out;
-}
-
-/* Global interfaces/init */
-
-static void bcachefs_exit(void)
-{
- bch2_debug_exit();
- bch2_vfs_exit();
- bch2_chardev_exit();
- bch2_btree_key_cache_exit();
- if (bcachefs_kset)
- kset_unregister(bcachefs_kset);
-}
-
-static int __init bcachefs_init(void)
-{
- bch2_bkey_pack_test();
-
- if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
- bch2_btree_key_cache_init() ||
- bch2_chardev_init() ||
- bch2_vfs_init() ||
- bch2_debug_init())
- goto err;
-
- return 0;
-err:
- bcachefs_exit();
- return -ENOMEM;
-}
-
-#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name);
-BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
-static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp)
-{
- /* Match bool exactly, by re-using it. */
- struct static_key *key = kp->arg;
- struct kernel_param boolkp = *kp;
- bool v;
- int ret;
-
- boolkp.arg = &v;
-
- ret = param_set_bool(val, &boolkp);
- if (ret)
- return ret;
- if (v)
- static_key_enable(key);
- else
- static_key_disable(key);
- return 0;
-}
-
-static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp)
-{
- struct static_key *key = kp->arg;
-	return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'Y' : 'N');
-}
-
-static const struct kernel_param_ops bch2_param_ops_static_key_t = {
- .flags = KERNEL_PARAM_OPS_FL_NOARG,
- .set = bch2_param_set_static_key_t,
- .get = bch2_param_get_static_key_t,
-};
-
-#define BCH_DEBUG_PARAM(name, description) \
- module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\
- __MODULE_PARM_TYPE(name, "static_key_t"); \
- MODULE_PARM_DESC(name, description);
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
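
Taken together, the two expansions above turn every entry in the debug param list into a static branch that can be flipped at runtime via /sys/module/bcachefs/parameters/<name>. For one hypothetical entry, say BCH_DEBUG_PARAM(expensive_debug_checks, "..."), they generate roughly:

	DEFINE_STATIC_KEY_FALSE(bch2_expensive_debug_checks);

	module_param_cb(expensive_debug_checks, &bch2_param_ops_static_key_t,
			&bch2_expensive_debug_checks.key, 0644);
	__MODULE_PARM_TYPE(expensive_debug_checks, "static_key_t");
	MODULE_PARM_DESC(expensive_debug_checks, "...");
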
-
-__maybe_unused
-static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
-module_param_named(version, bch2_metadata_version, uint, 0444);
-
-module_exit(bcachefs_exit);
-module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
deleted file mode 100644
index e90bab9afe78..000000000000
--- a/fs/bcachefs/super.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_H
-#define _BCACHEFS_SUPER_H
-
-#include "extents.h"
-
-#include "bcachefs_ioctl.h"
-
-#include <linux/math64.h>
-
-extern const char * const bch2_fs_flag_strs[];
-extern const char * const bch2_write_refs[];
-extern const char * const bch2_dev_read_refs[];
-extern const char * const bch2_dev_write_refs[];
-
-struct bch_fs *bch2_dev_to_fs(dev_t);
-struct bch_fs *bch2_uuid_to_fs(__uuid_t);
-
-bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-
-int bch2_dev_fail(struct bch_dev *, int);
-int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_add(struct bch_fs *, const char *);
-int bch2_dev_online(struct bch_fs *, const char *);
-int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
-struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
-
-bool bch2_fs_emergency_read_only(struct bch_fs *);
-bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *);
-
-bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
-void bch2_fs_read_only(struct bch_fs *);
-
-int bch2_fs_read_write(struct bch_fs *);
-int bch2_fs_read_write_early(struct bch_fs *);
-
-int bch2_fs_resize_on_mount(struct bch_fs *);
-
-void __bch2_fs_stop(struct bch_fs *);
-void bch2_fs_free(struct bch_fs *);
-void bch2_fs_stop(struct bch_fs *);
-
-int bch2_fs_init_rw(struct bch_fs *);
-int bch2_fs_start(struct bch_fs *);
-struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *);
-
-extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
-
-#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
deleted file mode 100644
index 3a899f799d1d..000000000000
--- a/fs/bcachefs/super_types.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SUPER_TYPES_H
-#define _BCACHEFS_SUPER_TYPES_H
-
-struct bch_fs;
-
-struct bch_sb_handle_holder {
- struct bch_fs *c;
-};
-
-struct bch_sb_handle {
- struct bch_sb *sb;
- struct file *s_bdev_file;
- struct block_device *bdev;
- char *sb_name;
- struct bio *bio;
- struct bch_sb_handle_holder *holder;
- size_t buffer_size;
- blk_mode_t mode;
- unsigned have_layout:1;
- unsigned have_bio:1;
- unsigned fs_sb:1;
- u64 seq;
-};
-
-struct bch_devs_mask {
- unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
-};
-
-struct bch_devs_list {
- u8 nr;
- u8 data[BCH_BKEY_PTRS_MAX];
-};
-
-#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
deleted file mode 100644
index 05848375cea2..000000000000
--- a/fs/bcachefs/sysfs.c
+++ /dev/null
@@ -1,914 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bcache sysfs interfaces
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#ifndef NO_BCACHEFS_SYSFS
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "sysfs.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "compress.h"
-#include "disk_accounting.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "enumerated_ref.h"
-#include "error.h"
-#include "inode.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "move.h"
-#include "movinggc.h"
-#include "nocow_locking.h"
-#include "opts.h"
-#include "rebalance.h"
-#include "recovery_passes.h"
-#include "replicas.h"
-#include "sb-errors.h"
-#include "super-io.h"
-#include "tests.h"
-
-#include <linux/blkdev.h>
-#include <linux/sort.h>
-#include <linux/sched/clock.h>
-
-#include "util.h"
-
-#define SYSFS_OPS(type) \
-const struct sysfs_ops type ## _sysfs_ops = { \
- .show = type ## _show, \
- .store = type ## _store \
-}
-
-#define SHOW(fn) \
-static ssize_t fn ## _to_text(struct printbuf *, \
- struct kobject *, struct attribute *); \
- \
-static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
- char *buf) \
-{ \
- struct printbuf out = PRINTBUF; \
- ssize_t ret = fn ## _to_text(&out, kobj, attr); \
- \
- if (out.pos && out.buf[out.pos - 1] != '\n') \
- prt_newline(&out); \
- \
- if (!ret && out.allocation_failure) \
- ret = -ENOMEM; \
- \
- if (!ret) { \
- ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \
- memcpy(buf, out.buf, ret); \
- } \
- printbuf_exit(&out); \
- return bch2_err_class(ret); \
-} \
- \
-static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
- struct attribute *attr)
-
-#define STORE(fn) \
-static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
- const char *, size_t); \
- \
-static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
- const char *buf, size_t size) \
-{ \
- return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
-} \
- \
-static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
- const char *buf, size_t size)
-
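
SHOW()/STORE() adapt a printbuf-based *_to_text() / *_store_inner() body to the sysfs show/store signatures, clamping output to PAGE_SIZE - 1 and mapping private error codes through bch2_err_class(); SYSFS_OPS() then bundles the pair into a sysfs_ops. A minimal, hypothetical kobject type would use them like this:

	SHOW(example)
	{
		prt_printf(out, "hello\n");
		return 0;
	}

	STORE(example)
	{
		return size;		/* accept and discard writes */
	}
	SYSFS_OPS(example);
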
-#define __sysfs_attribute(_name, _mode) \
- static struct attribute sysfs_##_name = \
- { .name = #_name, .mode = _mode }
-
-#define write_attribute(n) __sysfs_attribute(n, 0200)
-#define read_attribute(n) __sysfs_attribute(n, 0444)
-#define rw_attribute(n) __sysfs_attribute(n, 0644)
-
-#define sysfs_printf(file, fmt, ...) \
-do { \
- if (attr == &sysfs_ ## file) \
- prt_printf(out, fmt "\n", __VA_ARGS__); \
-} while (0)
-
-#define sysfs_print(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- snprint(out, var); \
-} while (0)
-
-#define sysfs_hprint(file, val) \
-do { \
- if (attr == &sysfs_ ## file) \
- prt_human_readable_s64(out, val); \
-} while (0)
-
-#define sysfs_strtoul(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe(buf, var) ?: (ssize_t) size; \
-} while (0)
-
-#define sysfs_strtoul_clamp(file, var, min, max) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe_clamp(buf, var, min, max) \
- ?: (ssize_t) size; \
-} while (0)
-
-#define strtoul_or_return(cp) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (_r) \
- return _r; \
- _v; \
-})
-
-write_attribute(trigger_gc);
-write_attribute(trigger_discards);
-write_attribute(trigger_invalidates);
-write_attribute(trigger_journal_commit);
-write_attribute(trigger_journal_flush);
-write_attribute(trigger_journal_writes);
-write_attribute(trigger_btree_cache_shrink);
-write_attribute(trigger_btree_key_cache_shrink);
-write_attribute(trigger_btree_updates);
-write_attribute(trigger_freelist_wakeup);
-write_attribute(trigger_recalc_capacity);
-write_attribute(trigger_delete_dead_snapshots);
-write_attribute(trigger_emergency_read_only);
-read_attribute(gc_gens_pos);
-
-read_attribute(uuid);
-read_attribute(minor);
-read_attribute(flags);
-read_attribute(first_bucket);
-read_attribute(nbuckets);
-read_attribute(io_done);
-read_attribute(io_errors);
-write_attribute(io_errors_reset);
-
-read_attribute(io_latency_read);
-read_attribute(io_latency_write);
-read_attribute(io_latency_stats_read);
-read_attribute(io_latency_stats_write);
-read_attribute(congested);
-
-read_attribute(btree_write_stats);
-
-read_attribute(btree_cache_size);
-read_attribute(compression_stats);
-read_attribute(errors);
-read_attribute(journal_debug);
-read_attribute(btree_cache);
-read_attribute(btree_key_cache);
-read_attribute(btree_reserve_cache);
-read_attribute(open_buckets);
-read_attribute(open_buckets_partial);
-read_attribute(nocow_lock_table);
-
-read_attribute(read_refs);
-read_attribute(write_refs);
-
-read_attribute(internal_uuid);
-read_attribute(disk_groups);
-
-read_attribute(has_data);
-read_attribute(alloc_debug);
-read_attribute(usage_base);
-
-#define x(t, n, ...) read_attribute(t);
-BCH_PERSISTENT_COUNTERS()
-#undef x
-
-rw_attribute(label);
-
-read_attribute(copy_gc_wait);
-
-sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_status);
-read_attribute(snapshot_delete_status);
-read_attribute(recovery_status);
-
-read_attribute(new_stripes);
-
-read_attribute(io_timers_read);
-read_attribute(io_timers_write);
-
-read_attribute(moving_ctxts);
-
-#ifdef CONFIG_BCACHEFS_TESTS
-write_attribute(perf_test);
-#endif /* CONFIG_BCACHEFS_TESTS */
-
-#define x(_name) \
- static struct attribute sysfs_time_stat_##_name = \
- { .name = #_name, .mode = 0644 };
- BCH_TIME_STATS()
-#undef x
-
-static size_t bch2_btree_cache_size(struct bch_fs *c)
-{
- struct btree_cache *bc = &c->btree_cache;
- size_t ret = 0;
- struct btree *b;
-
- mutex_lock(&bc->lock);
- list_for_each_entry(b, &bc->live[0].list, list)
- ret += btree_buf_bytes(b);
- list_for_each_entry(b, &bc->live[1].list, list)
- ret += btree_buf_bytes(b);
- list_for_each_entry(b, &bc->freeable, list)
- ret += btree_buf_bytes(b);
- mutex_unlock(&bc->lock);
- return ret;
-}
-
-static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
-{
- prt_str(out, "type");
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 24);
- prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
-
- for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) {
- struct disk_accounting_pos a;
- disk_accounting_key_init(a, compression, .type = i);
- struct bpos p = disk_accounting_pos_to_bpos(&a);
- u64 v[3];
- bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v));
-
- u64 nr_extents = v[0];
- u64 sectors_uncompressed = v[1];
- u64 sectors_compressed = v[2];
-
- bch2_prt_compression_type(out, i);
- prt_tab(out);
-
- prt_human_readable_u64(out, sectors_compressed << 9);
- prt_tab_rjust(out);
-
- prt_human_readable_u64(out, sectors_uncompressed << 9);
- prt_tab_rjust(out);
-
- prt_human_readable_u64(out, nr_extents
- ? div64_u64(sectors_uncompressed << 9, nr_extents)
- : 0);
- prt_tab_rjust(out);
- prt_newline(out);
- }
-
- return 0;
-}
-
-static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
-{
- bch2_btree_id_to_text(out, c->gc_gens_btree);
- prt_printf(out, ": ");
- bch2_bpos_to_text(out, c->gc_gens_pos);
- prt_printf(out, "\n");
-}
-
-static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct bch_fs_usage_base b = {};
-
- acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64));
-
- prt_printf(out, "hidden:\t\t%llu\n", b.hidden);
- prt_printf(out, "btree:\t\t%llu\n", b.btree);
- prt_printf(out, "data:\t\t%llu\n", b.data);
- prt_printf(out, "cached:\t%llu\n", b.cached);
- prt_printf(out, "reserved:\t\t%llu\n", b.reserved);
- prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes);
-}
-
-SHOW(bch2_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- sysfs_print(minor, c->minor);
- sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
-
- if (attr == &sysfs_flags)
- prt_bitflags(out, bch2_fs_flag_strs, c->flags);
-
- sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
-
- if (attr == &sysfs_btree_write_stats)
- bch2_btree_write_stats_to_text(out, c);
-
- if (attr == &sysfs_gc_gens_pos)
- bch2_gc_gens_pos_to_text(out, c);
-
- sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
-
- if (attr == &sysfs_copy_gc_wait)
- bch2_copygc_wait_to_text(out, c);
-
- if (attr == &sysfs_rebalance_status)
- bch2_rebalance_status_to_text(out, c);
-
- if (attr == &sysfs_snapshot_delete_status)
- bch2_snapshot_delete_status_to_text(out, c);
-
- if (attr == &sysfs_recovery_status)
- bch2_recovery_pass_status_to_text(out, c);
-
- /* Debugging: */
-
- if (attr == &sysfs_journal_debug)
- bch2_journal_debug_to_text(out, &c->journal);
-
- if (attr == &sysfs_btree_cache)
- bch2_btree_cache_to_text(out, &c->btree_cache);
-
- if (attr == &sysfs_btree_key_cache)
- bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
-
- if (attr == &sysfs_btree_reserve_cache)
- bch2_btree_reserve_cache_to_text(out, c);
-
- if (attr == &sysfs_open_buckets)
- bch2_open_buckets_to_text(out, c, NULL);
-
- if (attr == &sysfs_open_buckets_partial)
- bch2_open_buckets_partial_to_text(out, c);
-
- if (attr == &sysfs_compression_stats)
- bch2_compression_stats_to_text(out, c);
-
- if (attr == &sysfs_errors)
- bch2_fs_errors_to_text(out, c);
-
- if (attr == &sysfs_new_stripes)
- bch2_new_stripes_to_text(out, c);
-
- if (attr == &sysfs_io_timers_read)
- bch2_io_timers_to_text(out, &c->io_clock[READ]);
-
- if (attr == &sysfs_io_timers_write)
- bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
-
- if (attr == &sysfs_moving_ctxts)
- bch2_fs_moving_ctxts_to_text(out, c);
-
- if (attr == &sysfs_write_refs)
- enumerated_ref_to_text(out, &c->writes, bch2_write_refs);
-
- if (attr == &sysfs_nocow_lock_table)
- bch2_nocow_locks_to_text(out, &c->nocow_locks);
-
- if (attr == &sysfs_disk_groups)
- bch2_disk_groups_to_text(out, c);
-
- if (attr == &sysfs_alloc_debug)
- bch2_fs_alloc_debug_to_text(out, c);
-
- if (attr == &sysfs_usage_base)
- bch2_fs_usage_base_to_text(out, c);
-
- return 0;
-}
-
-STORE(bch2_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
-
- /* Debugging: */
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EPERM;
-
- /* Debugging: */
-
- if (attr == &sysfs_trigger_btree_updates)
- queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
-
- if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))
- return -EROFS;
-
- if (attr == &sysfs_trigger_btree_cache_shrink) {
- struct btree_cache *bc = &c->btree_cache;
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = strtoul_or_return(buf);
- bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
- }
-
- if (attr == &sysfs_trigger_btree_key_cache_shrink) {
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
- }
-
- if (attr == &sysfs_trigger_gc)
- bch2_gc_gens(c);
-
- if (attr == &sysfs_trigger_discards)
- bch2_do_discards(c);
-
- if (attr == &sysfs_trigger_invalidates)
- bch2_do_invalidates(c);
-
- if (attr == &sysfs_trigger_journal_commit)
- bch2_journal_flush(&c->journal);
-
- if (attr == &sysfs_trigger_journal_flush) {
- bch2_journal_flush_all_pins(&c->journal);
- bch2_journal_meta(&c->journal);
- }
-
- if (attr == &sysfs_trigger_journal_writes)
- bch2_journal_do_writes(&c->journal);
-
- if (attr == &sysfs_trigger_freelist_wakeup)
- closure_wake_up(&c->freelist_wait);
-
- if (attr == &sysfs_trigger_recalc_capacity) {
- down_read(&c->state_lock);
- bch2_recalc_capacity(c);
- up_read(&c->state_lock);
- }
-
- if (attr == &sysfs_trigger_delete_dead_snapshots)
- __bch2_delete_dead_snapshots(c);
-
- if (attr == &sysfs_trigger_emergency_read_only) {
- struct printbuf buf = PRINTBUF;
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "shutdown by sysfs\n");
- bch2_fs_emergency_read_only2(c, &buf);
- bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
-
-#ifdef CONFIG_BCACHEFS_TESTS
- if (attr == &sysfs_perf_test) {
- char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
- char *test = strsep(&p, " \t\n");
- char *nr_str = strsep(&p, " \t\n");
- char *threads_str = strsep(&p, " \t\n");
- unsigned threads;
- u64 nr;
- int ret = -EINVAL;
-
- if (threads_str &&
- !(ret = kstrtouint(threads_str, 10, &threads)) &&
- !(ret = bch2_strtoull_h(nr_str, &nr)))
- ret = bch2_btree_perf_test(c, test, nr, threads);
- kfree(tmp);
-
- if (ret)
- size = ret;
- }
-#endif
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
- return size;
-}
-SYSFS_OPS(bch2_fs);
-
-struct attribute *bch2_fs_files[] = {
- &sysfs_minor,
- &sysfs_btree_cache_size,
- &sysfs_btree_write_stats,
-
- &sysfs_rebalance_status,
- &sysfs_snapshot_delete_status,
- &sysfs_recovery_status,
-
- &sysfs_compression_stats,
- &sysfs_errors,
-
-#ifdef CONFIG_BCACHEFS_TESTS
- &sysfs_perf_test,
-#endif
- NULL
-};
-
-/* counters dir */
-
-SHOW(bch2_fs_counters)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
- u64 counter = 0;
- u64 counter_since_mount = 0;
-
- printbuf_tabstop_push(out, 32);
-
- #define x(t, n, f, ...) \
- if (attr == &sysfs_##t) { \
- counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
- counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
- if (f & TYPE_SECTORS) { \
- counter <<= 9; \
- counter_since_mount <<= 9; \
- } \
- \
- prt_printf(out, "since mount:\t"); \
- (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\
- prt_human_readable_u64(out, counter_since_mount); \
- prt_newline(out); \
- \
- prt_printf(out, "since filesystem creation:\t"); \
- (f & TYPE_COUNTER) ? prt_u64(out, counter) : \
- prt_human_readable_u64(out, counter); \
- prt_newline(out); \
- }
- BCH_PERSISTENT_COUNTERS()
- #undef x
- return 0;
-}
-
-STORE(bch2_fs_counters) {
- return 0;
-}
-
-SYSFS_OPS(bch2_fs_counters);
-
-struct attribute *bch2_fs_counters_files[] = {
-#define x(t, ...) \
- &sysfs_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- NULL
-};
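
The attribute array and the show branches are both generated from the same x-macro list, BCH_PERSISTENT_COUNTERS(), defined elsewhere in the tree; each expansion site defines x() for what it needs and #undefs it afterwards. The pattern in miniature, with an illustrative two-entry list:

	#define EXAMPLE_COUNTERS()	\
		x(io_read,  0)		\
		x(io_write, 1)

	#define x(t, ...) &sysfs_##t,
	static struct attribute *example_counter_files[] = {
		EXAMPLE_COUNTERS()
		NULL
	};
	#undef x
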
-/* internal dir - just a wrapper */
-
-SHOW(bch2_fs_internal)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
-
- return bch2_fs_to_text(out, &c->kobj, attr);
-}
-
-STORE(bch2_fs_internal)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
-
- return bch2_fs_store(&c->kobj, attr, buf, size);
-}
-SYSFS_OPS(bch2_fs_internal);
-
-struct attribute *bch2_fs_internal_files[] = {
- &sysfs_flags,
- &sysfs_journal_debug,
- &sysfs_btree_cache,
- &sysfs_btree_key_cache,
- &sysfs_btree_reserve_cache,
- &sysfs_new_stripes,
- &sysfs_open_buckets,
- &sysfs_open_buckets_partial,
- &sysfs_write_refs,
- &sysfs_nocow_lock_table,
- &sysfs_io_timers_read,
- &sysfs_io_timers_write,
-
- &sysfs_trigger_gc,
- &sysfs_trigger_discards,
- &sysfs_trigger_invalidates,
- &sysfs_trigger_journal_commit,
- &sysfs_trigger_journal_flush,
- &sysfs_trigger_journal_writes,
- &sysfs_trigger_btree_cache_shrink,
- &sysfs_trigger_btree_key_cache_shrink,
- &sysfs_trigger_btree_updates,
- &sysfs_trigger_freelist_wakeup,
- &sysfs_trigger_recalc_capacity,
- &sysfs_trigger_delete_dead_snapshots,
- &sysfs_trigger_emergency_read_only,
-
- &sysfs_gc_gens_pos,
-
- &sysfs_copy_gc_wait,
-
- sysfs_pd_controller_files(rebalance),
-
- &sysfs_moving_ctxts,
-
- &sysfs_internal_uuid,
-
- &sysfs_disk_groups,
- &sysfs_alloc_debug,
- &sysfs_usage_base,
- NULL
-};
-
-/* options */
-
-static ssize_t sysfs_opt_show(struct bch_fs *c,
- struct bch_dev *ca,
- enum bch_opt_id id,
- struct printbuf *out)
-{
- const struct bch_option *opt = bch2_opt_table + id;
- u64 v;
-
- if (opt->flags & OPT_FS) {
- v = bch2_opt_get_by_id(&c->opts, id);
- } else if ((opt->flags & OPT_DEVICE) && opt->get_member) {
- v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx);
- } else {
- return -EINVAL;
- }
-
- bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
- prt_char(out, '\n');
- return 0;
-}
-
-static ssize_t sysfs_opt_store(struct bch_fs *c,
- struct bch_dev *ca,
- enum bch_opt_id id,
- const char *buf, size_t size)
-{
- const struct bch_option *opt = bch2_opt_table + id;
- int ret = 0;
-
- /*
- * We don't need to take c->writes for correctness, but it eliminates an
- * unsightly error message in the dmesg log when we're RO:
- */
- if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)))
- return -EROFS;
-
- char *tmp = kstrdup(buf, GFP_KERNEL);
- if (!tmp) {
- ret = -ENOMEM;
- goto err;
- }
-
- u64 v;
- ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
- bch2_opt_hook_pre_set(c, ca, id, v);
- kfree(tmp);
-
- if (ret < 0)
- goto err;
-
- bool is_sb = opt->get_sb || opt->get_member;
- bool changed = false;
-
- if (is_sb) {
- changed = bch2_opt_set_sb(c, ca, opt, v);
- } else if (!ca) {
- changed = bch2_opt_get_by_id(&c->opts, id) != v;
- } else {
- /* device options that aren't superblock options aren't
- * supported */
- BUG();
- }
-
- if (!ca)
- bch2_opt_set_by_id(&c->opts, id, v);
-
- if (changed)
- bch2_opt_hook_post_set(c, ca, 0, &c->opts, id);
-
- ret = size;
-err:
- enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
- return ret;
-}
-
-SHOW(bch2_fs_opts_dir)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- int id = bch2_opt_lookup(attr->name);
- if (id < 0)
- return 0;
-
- return sysfs_opt_show(c, NULL, id, out);
-}
-
-STORE(bch2_fs_opts_dir)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- int id = bch2_opt_lookup(attr->name);
- if (id < 0)
- return 0;
-
- return sysfs_opt_store(c, NULL, id, buf, size);
-}
-SYSFS_OPS(bch2_fs_opts_dir);
-
-struct attribute *bch2_fs_opts_dir_files[] = { NULL };
-
-int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
-{
- for (const struct bch_option *i = bch2_opt_table;
- i < bch2_opt_table + bch2_opts_nr;
- i++) {
- if (i->flags & OPT_HIDDEN)
- continue;
- if (!(i->flags & type))
- continue;
-
- int ret = sysfs_create_file(kobj, &i->attr);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* time stats */
-
-SHOW(bch2_fs_time_stats)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define x(name) \
- if (attr == &sysfs_time_stat_##name) \
- bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
- BCH_TIME_STATS()
-#undef x
-
- return 0;
-}
-
-STORE(bch2_fs_time_stats)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define x(name) \
- if (attr == &sysfs_time_stat_##name) \
- bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
- BCH_TIME_STATS()
-#undef x
- return size;
-}
-SYSFS_OPS(bch2_fs_time_stats);
-
-struct attribute *bch2_fs_time_stats_files[] = {
-#define x(name) \
- &sysfs_time_stat_##name,
- BCH_TIME_STATS()
-#undef x
- NULL
-};
-
-static const char * const bch2_rw[] = {
- "read",
- "write",
- NULL
-};
-
-static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- int rw, i;
-
- for (rw = 0; rw < 2; rw++) {
- prt_printf(out, "%s:\n", bch2_rw[rw]);
-
- for (i = 1; i < BCH_DATA_NR; i++)
- prt_printf(out, "%-12s:%12llu\n",
- bch2_data_type_str(i),
- percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
- }
-}
-
-SHOW(bch2_dev)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
- struct bch_fs *c = ca->fs;
-
- sysfs_printf(uuid, "%pU\n", ca->uuid.b);
-
- sysfs_print(first_bucket, ca->mi.first_bucket);
- sysfs_print(nbuckets, ca->mi.nbuckets);
-
- if (attr == &sysfs_label) {
- if (ca->mi.group)
- bch2_disk_path_to_text(out, c, ca->mi.group - 1);
- prt_char(out, '\n');
- }
-
- if (attr == &sysfs_has_data) {
- prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
- prt_char(out, '\n');
- }
-
- if (attr == &sysfs_io_done)
- dev_io_done_to_text(out, ca);
-
- if (attr == &sysfs_io_errors)
- bch2_dev_io_errors_to_text(out, ca);
-
- sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
- sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
-
- if (attr == &sysfs_io_latency_stats_read)
- bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
-
- if (attr == &sysfs_io_latency_stats_write)
- bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
-
- sysfs_printf(congested, "%u%%",
- clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
- * 100 / CONGESTED_MAX);
-
- if (attr == &sysfs_alloc_debug)
- bch2_dev_alloc_debug_to_text(out, ca);
-
- if (attr == &sysfs_open_buckets)
- bch2_open_buckets_to_text(out, c, ca);
-
- int opt_id = bch2_opt_lookup(attr->name);
- if (opt_id >= 0)
- return sysfs_opt_show(c, ca, opt_id, out);
-
- if (attr == &sysfs_read_refs)
- enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs);
-
- if (attr == &sysfs_write_refs)
- enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs);
-
- return 0;
-}
-
-STORE(bch2_dev)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
- struct bch_fs *c = ca->fs;
-
- if (attr == &sysfs_label) {
- char *tmp;
- int ret;
-
- tmp = kstrdup(buf, GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
-
- ret = bch2_dev_group_set(c, ca, strim(tmp));
- kfree(tmp);
- if (ret)
- return ret;
- }
-
- if (attr == &sysfs_io_errors_reset)
- bch2_dev_errors_reset(ca);
-
- int opt_id = bch2_opt_lookup(attr->name);
- if (opt_id >= 0)
- return sysfs_opt_store(c, ca, opt_id, buf, size);
-
- return size;
-}
-SYSFS_OPS(bch2_dev);
-
-struct attribute *bch2_dev_files[] = {
- &sysfs_uuid,
- &sysfs_first_bucket,
- &sysfs_nbuckets,
-
- /* settings: */
- &sysfs_label,
-
- &sysfs_has_data,
- &sysfs_io_done,
- &sysfs_io_errors,
- &sysfs_io_errors_reset,
-
- &sysfs_io_latency_read,
- &sysfs_io_latency_write,
- &sysfs_io_latency_stats_read,
- &sysfs_io_latency_stats_write,
- &sysfs_congested,
-
- /* debug: */
- &sysfs_alloc_debug,
- &sysfs_open_buckets,
-
- &sysfs_read_refs,
- &sysfs_write_refs,
- NULL
-};
-
-#endif /* NO_BCACHEFS_SYSFS */
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
deleted file mode 100644
index 303e0433c702..000000000000
--- a/fs/bcachefs/sysfs.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_SYSFS_H_
-#define _BCACHEFS_SYSFS_H_
-
-#include <linux/sysfs.h>
-
-#ifndef NO_BCACHEFS_SYSFS
-
-struct attribute;
-struct sysfs_ops;
-
-extern struct attribute *bch2_fs_files[];
-extern struct attribute *bch2_fs_counters_files[];
-extern struct attribute *bch2_fs_internal_files[];
-extern struct attribute *bch2_fs_opts_dir_files[];
-extern struct attribute *bch2_fs_time_stats_files[];
-extern struct attribute *bch2_dev_files[];
-
-extern const struct sysfs_ops bch2_fs_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-extern const struct sysfs_ops bch2_dev_sysfs_ops;
-
-int bch2_opts_create_sysfs_files(struct kobject *, unsigned);
-
-#else
-
-static struct attribute *bch2_fs_files[] = {};
-static struct attribute *bch2_fs_counters_files[] = {};
-static struct attribute *bch2_fs_internal_files[] = {};
-static struct attribute *bch2_fs_opts_dir_files[] = {};
-static struct attribute *bch2_fs_time_stats_files[] = {};
-static struct attribute *bch2_dev_files[] = {};
-
-static const struct sysfs_ops bch2_fs_sysfs_ops;
-static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
-static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
-static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-static const struct sysfs_ops bch2_dev_sysfs_ops;
-
-static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
-{ return 0; }
-
-#endif /* NO_BCACHEFS_SYSFS */
-
-#endif /* _BCACHEFS_SYSFS_H_ */
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
deleted file mode 100644
index 782a05fe7656..000000000000
--- a/fs/bcachefs/tests.c
+++ /dev/null
@@ -1,891 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_BCACHEFS_TESTS
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "journal_reclaim.h"
-#include "snapshot.h"
-#include "tests.h"
-
-#include "linux/kthread.h"
-#include "linux/random.h"
-
-static void delete_test_keys(struct bch_fs *c)
-{
- int ret;
-
- ret = bch2_btree_delete_range(c, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX),
- POS(0, U64_MAX),
- 0, NULL);
- BUG_ON(ret);
-
- ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- POS(0, U64_MAX),
- 0, NULL);
- BUG_ON(ret);
-}
-
-/* unit tests */
-
-static int test_delete(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k.p.snapshot = U32_MAX;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_intent);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, &k.k_i, 0));
- bch_err_msg(c, ret, "update error");
- if (ret)
- goto err;
-
- pr_info("deleting once");
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_delete_at(trans, &iter, 0));
- bch_err_msg(c, ret, "delete error (first)");
- if (ret)
- goto err;
-
- pr_info("deleting twice");
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_delete_at(trans, &iter, 0));
- bch_err_msg(c, ret, "delete error (second)");
- if (ret)
- goto err;
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int test_delete_written(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k.p.snapshot = U32_MAX;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_intent);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, &k.k_i, 0));
- bch_err_msg(c, ret, "update error");
- if (ret)
- goto err;
-
- bch2_trans_unlock(trans);
- bch2_journal_flush_all_pins(&c->journal);
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_btree_delete_at(trans, &iter, 0));
- bch_err_msg(c, ret, "delete error");
- if (ret)
- goto err;
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int test_iterate(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test keys");
-
- for (i = 0; i < nr; i++) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i;
- ck.k.p.snapshot = U32_MAX;
-
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i++);
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr);
-
- pr_info("iterating backwards");
-
- ret = bch2_trans_run(c,
- for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
- SPOS(0, U64_MAX, U32_MAX), 0, k, ({
- BUG_ON(k.k->p.offset != --i);
- 0;
- })));
- bch_err_msg(c, ret, "error iterating backwards");
- if (ret)
- return ret;
-
- BUG_ON(i);
- return 0;
-}
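
for_each_btree_key_max() takes the per-key body as its last argument; the body is evaluated for every key in the range and must produce 0 to continue (or an error / transaction-restart code to stop). A sketch of the same idiom counting keys rather than asserting on them:

	u64 nr_keys = 0;
	int ret = bch2_trans_run(c,
		for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
				       SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
				       0, k, ({
			nr_keys++;
			0;
		})));
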
-
-static int test_iterate_extents(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test extents");
-
- for (i = 0; i < nr; i += 8) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i + 8;
- ck.k.p.snapshot = U32_MAX;
- ck.k.size = 8;
-
- ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i);
- i = k.k->p.offset;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr);
-
- pr_info("iterating backwards");
-
- ret = bch2_trans_run(c,
- for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
- SPOS(0, U64_MAX, U32_MAX), 0, k, ({
- BUG_ON(k.k->p.offset != i);
- i = bkey_start_offset(k.k);
- 0;
- })));
- bch_err_msg(c, ret, "error iterating backwards");
- if (ret)
- return ret;
-
- BUG_ON(i);
- return 0;
-}
-
-static int test_iterate_slots(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test keys");
-
- for (i = 0; i < nr; i++) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i * 2;
- ck.k.p.snapshot = U32_MAX;
-
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i);
- i += 2;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr * 2);
-
- pr_info("iterating forwards by slots");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_slots, k, ({
- if (i >= nr * 2)
- break;
-
- BUG_ON(k.k->p.offset != i);
- BUG_ON(bkey_deleted(k.k) != (i & 1));
-
- i++;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards by slots");
- return ret;
-}
-
-static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
-{
- u64 i;
- int ret = 0;
-
- delete_test_keys(c);
-
- pr_info("inserting test keys");
-
- for (i = 0; i < nr; i += 16) {
- struct bkey_i_cookie ck;
-
- bkey_cookie_init(&ck.k_i);
- ck.k.p.offset = i + 16;
- ck.k.p.snapshot = U32_MAX;
- ck.k.size = 8;
-
- ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
- bch_err_msg(c, ret, "insert error");
- if (ret)
- return ret;
- }
-
- pr_info("iterating forwards");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i + 8);
- BUG_ON(k.k->size != 8);
- i += 16;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards");
- if (ret)
- return ret;
-
- BUG_ON(i != nr);
-
- pr_info("iterating forwards by slots");
- i = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_slots, k, ({
- if (i == nr)
- break;
- BUG_ON(bkey_deleted(k.k) != !(i % 16));
-
- BUG_ON(bkey_start_offset(k.k) != i);
- BUG_ON(k.k->size != 8);
- i = k.k->p.offset;
- 0;
- })));
- bch_err_msg(c, ret, "error iterating forwards by slots");
- return ret;
-}
-
-/*
- * XXX: we really want to make sure we've got a btree with depth > 0 for these
- * tests
- */
-static int test_peek_end(struct bch_fs *c, u64 nr)
-{
- delete_test_keys(c);
-
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return 0;
-}
-
-static int test_peek_end_extents(struct bch_fs *c, u64 nr)
-{
- delete_test_keys(c);
-
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), 0);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
- BUG_ON(k.k);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return 0;
-}
-
-/* extent unit tests */
-
-static u64 test_version;
-
-static int insert_test_extent(struct bch_fs *c,
- u64 start, u64 end)
-{
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k_i.k.p.offset = end;
- k.k_i.k.p.snapshot = U32_MAX;
- k.k_i.k.size = end - start;
- k.k_i.k.bversion.lo = test_version++;
-
- ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int __test_extent_overwrite(struct bch_fs *c,
- u64 e1_start, u64 e1_end,
- u64 e2_start, u64 e2_end)
-{
- int ret;
-
- ret = insert_test_extent(c, e1_start, e1_end) ?:
- insert_test_extent(c, e2_start, e2_end);
-
- delete_test_keys(c);
- return ret;
-}
-
-static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 0, 64, 0, 32) ?:
- __test_extent_overwrite(c, 8, 64, 0, 32);
-}
-
-static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 0, 64, 32, 64) ?:
- __test_extent_overwrite(c, 0, 64, 32, 72);
-}
-
-static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 0, 64, 32, 40);
-}
-
-static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
-{
- return __test_extent_overwrite(c, 32, 64, 0, 64) ?:
- __test_extent_overwrite(c, 32, 64, 0, 128) ?:
- __test_extent_overwrite(c, 32, 64, 32, 64) ?:
- __test_extent_overwrite(c, 32, 64, 32, 128);
-}
-
-static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid)
-{
- struct bkey_i_cookie k;
- int ret;
-
- bkey_cookie_init(&k.k_i);
- k.k_i.k.p.inode = inum;
- k.k_i.k.p.offset = start + len;
- k.k_i.k.p.snapshot = snapid;
- k.k_i.k.size = len;
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
- BTREE_UPDATE_internal_snapshot_node));
- bch_err_fn(c, ret);
- return ret;
-}
-
-static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
-{
- return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */
- insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?:
- insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?:
- insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */
- insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?:
- insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?:
- insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX);
-}
-
-/* snapshot unit tests */
-
-/* Test skipping over keys in unrelated snapshots: */
-static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
-{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i_cookie cookie;
- int ret;
-
- bkey_cookie_init(&cookie.k_i);
- cookie.k.p.snapshot = snapid_hi;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
- if (ret)
- return ret;
-
- trans = bch2_trans_get(c);
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, snapid_lo), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
-
- BUG_ON(k.k->p.snapshot != U32_MAX);
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int test_snapshots(struct bch_fs *c, u64 nr)
-{
- struct bkey_i_cookie cookie;
- u32 snapids[2];
- u32 snapid_subvols[2] = { 1, 1 };
- int ret;
-
- bkey_cookie_init(&cookie.k_i);
- cookie.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
- if (ret)
- return ret;
-
- ret = bch2_trans_commit_do(c, NULL, NULL, 0,
- bch2_snapshot_node_create(trans, U32_MAX,
- snapids,
- snapid_subvols,
- 2));
- if (ret)
- return ret;
-
- if (snapids[0] > snapids[1])
- swap(snapids[0], snapids[1]);
-
- ret = test_snapshot_filter(c, snapids[0], snapids[1]);
- bch_err_msg(c, ret, "from test_snapshot_filter");
- return ret;
-}
-
-/* perf tests */
-
-static u64 test_rand(void)
-{
- u64 v;
-
- get_random_bytes(&v, sizeof(v));
- return v;
-}
-
-static int rand_insert(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bkey_i_cookie k;
- int ret = 0;
- u64 i;
-
- for (i = 0; i < nr; i++) {
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = test_rand();
- k.k.p.snapshot = U32_MAX;
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-static int rand_insert_multi(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bkey_i_cookie k[8];
- int ret = 0;
- unsigned j;
- u64 i;
-
- for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
- for (j = 0; j < ARRAY_SIZE(k); j++) {
- bkey_cookie_init(&k[j].k_i);
- k[j].k.p.offset = test_rand();
- k[j].k.p.snapshot = U32_MAX;
- }
-
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-static int rand_lookup(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
- u64 i;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0);
-
- for (i = 0; i < nr; i++) {
- bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX));
-
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
- ret = bkey_err(k);
- if (ret)
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int rand_mixed_trans(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i_cookie *cookie,
- u64 i, u64 pos)
-{
- struct bkey_s_c k;
- int ret;
-
- bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX));
-
- k = bch2_btree_iter_peek(trans, iter);
- ret = bkey_err(k);
- bch_err_msg(trans->c, ret, "lookup error");
- if (ret)
- return ret;
-
- if (!(i & 3) && k.k) {
- bkey_cookie_init(&cookie->k_i);
- cookie->k.p = iter->pos;
- ret = bch2_trans_update(trans, iter, &cookie->k_i, 0);
- }
-
- return ret;
-}
-
-static int rand_mixed(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i_cookie cookie;
- int ret = 0;
- u64 i, rand;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0);
-
- for (i = 0; i < nr; i++) {
- rand = test_rand();
- ret = commit_do(trans, NULL, NULL, 0,
- rand_mixed_trans(trans, &iter, &cookie, i, rand));
- if (ret)
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
-}
-
-static int __do_delete(struct btree_trans *trans, struct bpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
- BTREE_ITER_intent);
- k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX));
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k)
- goto err;
-
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int rand_delete(struct bch_fs *c, u64 nr)
-{
- struct btree_trans *trans = bch2_trans_get(c);
- int ret = 0;
- u64 i;
-
- for (i = 0; i < nr; i++) {
- struct bpos pos = SPOS(0, test_rand(), U32_MAX);
-
- ret = commit_do(trans, NULL, NULL, 0,
- __do_delete(trans, pos));
- if (ret)
- break;
- }
-
- bch2_trans_put(trans);
- return ret;
-}
-
-static int seq_insert(struct bch_fs *c, u64 nr)
-{
- struct bkey_i_cookie insert;
-
- bkey_cookie_init(&insert.k_i);
-
- return bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent, k,
- NULL, NULL, 0, ({
- if (iter.pos.offset >= nr)
- break;
- insert.k.p = iter.pos;
- bch2_trans_update(trans, &iter, &insert.k_i, 0);
- })));
-}
-
-static int seq_lookup(struct bch_fs *c, u64 nr)
-{
- return bch2_trans_run(c,
- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k,
- 0));
-}
-
-static int seq_overwrite(struct bch_fs *c, u64 nr)
-{
- return bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_intent, k,
- NULL, NULL, 0, ({
- struct bkey_i_cookie u;
-
- bkey_reassemble(&u.k_i, k);
- bch2_trans_update(trans, &iter, &u.k_i, 0);
- })));
-}
-
-static int seq_delete(struct bch_fs *c, u64 nr)
-{
- return bch2_btree_delete_range(c, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- POS(0, U64_MAX),
- 0, NULL);
-}
-
-typedef int (*perf_test_fn)(struct bch_fs *, u64);
-
-struct test_job {
- struct bch_fs *c;
- u64 nr;
- unsigned nr_threads;
- perf_test_fn fn;
-
- atomic_t ready;
- wait_queue_head_t ready_wait;
-
- atomic_t done;
- struct completion done_completion;
-
- u64 start;
- u64 finish;
- int ret;
-};
-
-static int btree_perf_test_thread(void *data)
-{
- struct test_job *j = data;
- int ret;
-
- if (atomic_dec_and_test(&j->ready)) {
- wake_up(&j->ready_wait);
- j->start = sched_clock();
- } else {
- wait_event(j->ready_wait, !atomic_read(&j->ready));
- }
-
- ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
- if (ret) {
- bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret));
- j->ret = ret;
- }
-
- if (atomic_dec_and_test(&j->done)) {
- j->finish = sched_clock();
- complete(&j->done_completion);
- }
-
- return 0;
-}
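
The start/finish bookkeeping is a two-sided rendezvous: ->ready counts arriving threads down so the last one stamps the start time and releases the rest, and ->done counts finishing threads down so the last one stamps the finish time and completes the waiter. The core of the arrival side, as a generic sketch:

	/* last thread to arrive wakes the rest; the others wait */
	static void rendezvous(atomic_t *nr, wait_queue_head_t *wq)
	{
		if (atomic_dec_and_test(nr))
			wake_up(wq);
		else
			wait_event(*wq, !atomic_read(nr));
	}
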
-
-int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
- u64 nr, unsigned nr_threads)
-{
- struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
- char name_buf[20];
- struct printbuf nr_buf = PRINTBUF;
- struct printbuf per_sec_buf = PRINTBUF;
- unsigned i;
- u64 time;
-
- if (nr == 0 || nr_threads == 0) {
- pr_err("nr of iterations or threads is not allowed to be 0");
- return -EINVAL;
- }
-
- atomic_set(&j.ready, nr_threads);
- init_waitqueue_head(&j.ready_wait);
-
- atomic_set(&j.done, nr_threads);
- init_completion(&j.done_completion);
-
-#define perf_test(_test) \
- if (!strcmp(testname, #_test)) j.fn = _test
-
- perf_test(rand_insert);
- perf_test(rand_insert_multi);
- perf_test(rand_lookup);
- perf_test(rand_mixed);
- perf_test(rand_delete);
-
- perf_test(seq_insert);
- perf_test(seq_lookup);
- perf_test(seq_overwrite);
- perf_test(seq_delete);
-
- /* a unit test, not a perf test: */
- perf_test(test_delete);
- perf_test(test_delete_written);
- perf_test(test_iterate);
- perf_test(test_iterate_extents);
- perf_test(test_iterate_slots);
- perf_test(test_iterate_slots_extents);
- perf_test(test_peek_end);
- perf_test(test_peek_end_extents);
-
- perf_test(test_extent_overwrite_front);
- perf_test(test_extent_overwrite_back);
- perf_test(test_extent_overwrite_middle);
- perf_test(test_extent_overwrite_all);
- perf_test(test_extent_create_overlapping);
-
- perf_test(test_snapshots);
-
- if (!j.fn) {
- pr_err("unknown test %s", testname);
- return -EINVAL;
- }
-
- //pr_info("running test %s:", testname);
-
- if (nr_threads == 1)
- btree_perf_test_thread(&j);
- else
- for (i = 0; i < nr_threads; i++)
- kthread_run(btree_perf_test_thread, &j,
- "bcachefs perf test[%u]", i);
-
- while (wait_for_completion_interruptible(&j.done_completion))
- ;
-
- time = j.finish - j.start;
-
- scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
- prt_human_readable_u64(&nr_buf, nr);
- prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
- printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
- name_buf, nr_buf.buf, nr_threads,
- div_u64(time, NSEC_PER_SEC),
- div_u64(time * nr_threads, nr),
- per_sec_buf.buf);
- printbuf_exit(&per_sec_buf);
- printbuf_exit(&nr_buf);
- return j.ret;
-}
-
-#endif /* CONFIG_BCACHEFS_TESTS */
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
deleted file mode 100644
index c73b18aea7e0..000000000000
--- a/fs/bcachefs/tests.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_TEST_H
-#define _BCACHEFS_TEST_H
-
-struct bch_fs;
-
-#ifdef CONFIG_BCACHEFS_TESTS
-
-int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
-
-#else
-
-#endif /* CONFIG_BCACHEFS_TESTS */
-
-#endif /* _BCACHEFS_TEST_H */
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
deleted file mode 100644
index 314a24d15d4e..000000000000
--- a/fs/bcachefs/thread_with_file.c
+++ /dev/null
@@ -1,494 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef NO_BCACHEFS_FS
-
-#include "bcachefs.h"
-#include "thread_with_file.h"
-
-#include <linux/anon_inodes.h>
-#include <linux/file.h>
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-#include <linux/poll.h>
-#include <linux/sched/sysctl.h>
-
-void bch2_thread_with_file_exit(struct thread_with_file *thr)
-{
- if (thr->task) {
- kthread_stop(thr->task);
- put_task_struct(thr->task);
- }
-}
-
-int bch2_run_thread_with_file(struct thread_with_file *thr,
- const struct file_operations *fops,
- int (*fn)(void *))
-{
- struct file *file = NULL;
- int ret, fd = -1;
- unsigned fd_flags = O_CLOEXEC;
-
- if (fops->read && fops->write)
- fd_flags |= O_RDWR;
- else if (fops->read)
- fd_flags |= O_RDONLY;
- else if (fops->write)
- fd_flags |= O_WRONLY;
-
- char name[TASK_COMM_LEN];
- get_task_comm(name, current);
-
- thr->ret = 0;
- thr->task = kthread_create(fn, thr, "%s", name);
- ret = PTR_ERR_OR_ZERO(thr->task);
- if (ret)
- return ret;
-
- ret = get_unused_fd_flags(fd_flags);
- if (ret < 0)
- goto err;
- fd = ret;
-
- file = anon_inode_getfile(name, fops, thr, fd_flags);
- ret = PTR_ERR_OR_ZERO(file);
- if (ret)
- goto err;
-
- get_task_struct(thr->task);
- wake_up_process(thr->task);
- fd_install(fd, file);
- return fd;
-err:
- if (fd >= 0)
- put_unused_fd(fd);
- if (thr->task)
- kthread_stop(thr->task);
- return ret;
-}
-
-/* stdio_redirect */
-
-static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen)
-{
- return stdio->input.buf.nr > seen || stdio->done;
-}
-
-static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
-{
- return stdio_redirect_has_more_input(stdio, 0);
-}
-
-static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
-{
- return stdio->output.buf.nr || stdio->done;
-}
-
-#define STDIO_REDIRECT_BUFSIZE 4096
-
-static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
-{
- return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
-}
-
-static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
-{
- return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
-}
-
-static void stdio_buf_init(struct stdio_buf *buf)
-{
- spin_lock_init(&buf->lock);
- init_waitqueue_head(&buf->wait);
- darray_init(&buf->buf);
-}
-
-/* thread_with_stdio */
-
-static void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
- thr->thr.done = true;
- thr->stdio.done = true;
- wake_up(&thr->stdio.input.wait);
- wake_up(&thr->stdio.output.wait);
-}
-
-static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
- size_t len, loff_t *ppos)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
- struct stdio_buf *buf = &thr->stdio.output;
- size_t copied = 0, b;
- int ret = 0;
-
- if (!(file->f_flags & O_NONBLOCK)) {
- ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
- if (ret)
- return ret;
- } else if (!stdio_redirect_has_output(&thr->stdio))
- return -EAGAIN;
-
- while (len && buf->buf.nr) {
- if (fault_in_writeable(ubuf, len) == len) {
- ret = -EFAULT;
- break;
- }
-
- spin_lock_irq(&buf->lock);
- b = min_t(size_t, len, buf->buf.nr);
-
- if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
- ubuf += b;
- len -= b;
- copied += b;
- buf->buf.nr -= b;
- memmove(buf->buf.data,
- buf->buf.data + b,
- buf->buf.nr);
- }
- spin_unlock_irq(&buf->lock);
- }
-
- return copied ?: ret;
-}
-
-static int thread_with_stdio_release(struct inode *inode, struct file *file)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- thread_with_stdio_done(thr);
- bch2_thread_with_file_exit(&thr->thr);
- darray_exit(&thr->stdio.input.buf);
- darray_exit(&thr->stdio.output.buf);
- thr->ops->exit(thr);
- return 0;
-}
-
-static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
- size_t len, loff_t *ppos)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
- struct stdio_buf *buf = &thr->stdio.input;
- size_t copied = 0;
- ssize_t ret = 0;
-
- while (len) {
- if (thr->thr.done) {
- ret = -EPIPE;
- break;
- }
-
- size_t b = len - fault_in_readable(ubuf, len);
- if (!b) {
- ret = -EFAULT;
- break;
- }
-
- spin_lock(&buf->lock);
- size_t makeroom = b;
- if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr))
- makeroom = min_t(ssize_t, makeroom,
- max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr,
- 0));
- darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT);
-
- b = min(len, darray_room(buf->buf));
-
- if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
- buf->buf.nr += b;
- ubuf += b;
- len -= b;
- copied += b;
- }
- spin_unlock(&buf->lock);
-
- if (b) {
- wake_up(&buf->wait);
- } else {
- if ((file->f_flags & O_NONBLOCK)) {
- ret = -EAGAIN;
- break;
- }
-
- ret = wait_event_interruptible(buf->wait,
- stdio_redirect_has_input_space(&thr->stdio));
- if (ret)
- break;
- }
- }
-
- return copied ?: ret;
-}
-
-static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- poll_wait(file, &thr->stdio.output.wait, wait);
- poll_wait(file, &thr->stdio.input.wait, wait);
-
- __poll_t mask = 0;
-
- if (stdio_redirect_has_output(&thr->stdio))
- mask |= EPOLLIN;
- if (stdio_redirect_has_input_space(&thr->stdio))
- mask |= EPOLLOUT;
- if (thr->thr.done)
- mask |= EPOLLHUP|EPOLLERR;
- return mask;
-}
-
-static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- poll_wait(file, &thr->stdio.output.wait, wait);
-
- __poll_t mask = 0;
-
- if (stdio_redirect_has_output(&thr->stdio))
- mask |= EPOLLIN;
- if (thr->thr.done)
- mask |= EPOLLHUP|EPOLLERR;
- return mask;
-}
-
-static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- return thr->thr.ret;
-}
-
-static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
-{
- struct thread_with_stdio *thr =
- container_of(file->private_data, struct thread_with_stdio, thr);
-
- if (thr->ops->unlocked_ioctl)
- return thr->ops->unlocked_ioctl(thr, cmd, p);
- return -ENOTTY;
-}
-
-static const struct file_operations thread_with_stdio_fops = {
- .read = thread_with_stdio_read,
- .write = thread_with_stdio_write,
- .poll = thread_with_stdio_poll,
- .flush = thread_with_stdio_flush,
- .release = thread_with_stdio_release,
- .unlocked_ioctl = thread_with_stdio_ioctl,
-};
-
-static const struct file_operations thread_with_stdout_fops = {
- .read = thread_with_stdio_read,
- .poll = thread_with_stdout_poll,
- .flush = thread_with_stdio_flush,
- .release = thread_with_stdio_release,
- .unlocked_ioctl = thread_with_stdio_ioctl,
-};
-
-static int thread_with_stdio_fn(void *arg)
-{
- struct thread_with_stdio *thr = arg;
-
- thr->thr.ret = thr->ops->fn(thr);
-
- thread_with_stdio_done(thr);
- return 0;
-}
-
-void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
-{
- stdio_buf_init(&thr->stdio.input);
- stdio_buf_init(&thr->stdio.output);
- thr->ops = ops;
-}
-
-int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
-{
- return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
-}
-
-int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
-{
- bch2_thread_with_stdio_init(thr, ops);
-
- return __bch2_run_thread_with_stdio(thr);
-}
-
-int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
- const struct thread_with_stdio_ops *ops)
-{
- stdio_buf_init(&thr->stdio.input);
- stdio_buf_init(&thr->stdio.output);
- thr->ops = ops;
-
- return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
-}
-EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
-
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
-{
- struct stdio_buf *buf = &stdio->input;
-
- /*
- * we're waiting on user input (or for the file descriptor to be
-	 * closed), so we don't want a hung task warning:
- */
- do {
- wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
- sysctl_hung_task_timeout_secs * HZ / 2);
- } while (!stdio_redirect_has_input(stdio));
-
- if (stdio->done)
- return -1;
-
- spin_lock(&buf->lock);
- int ret = min(len, buf->buf.nr);
- buf->buf.nr -= ret;
- memcpy(ubuf, buf->buf.data, ret);
- memmove(buf->buf.data,
- buf->buf.data + ret,
- buf->buf.nr);
- spin_unlock(&buf->lock);
-
- wake_up(&buf->wait);
- return ret;
-}
-
-int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio,
- darray_char *line,
- unsigned long timeout)
-{
- unsigned long until = jiffies + timeout, t;
- struct stdio_buf *buf = &stdio->input;
- size_t seen = 0;
-again:
- t = timeout != MAX_SCHEDULE_TIMEOUT
- ? max_t(long, until - jiffies, 0)
- : timeout;
-
- t = min(t, sysctl_hung_task_timeout_secs * HZ / 2);
-
- wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t);
-
- if (stdio->done)
- return -1;
-
- spin_lock(&buf->lock);
- seen = buf->buf.nr;
- char *n = memchr(buf->buf.data, '\n', seen);
-
- if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
- spin_unlock(&buf->lock);
- return -ETIME;
- }
-
- if (!n) {
- buf->waiting_for_line = true;
- spin_unlock(&buf->lock);
- goto again;
- }
-
- size_t b = n + 1 - buf->buf.data;
- if (b > line->size) {
- spin_unlock(&buf->lock);
- int ret = darray_resize(line, b);
- if (ret)
- return ret;
- seen = 0;
- goto again;
- }
-
- buf->buf.nr -= b;
- memcpy(line->data, buf->buf.data, b);
- memmove(buf->buf.data,
- buf->buf.data + b,
- buf->buf.nr);
- line->nr = b;
-
- buf->waiting_for_line = false;
- spin_unlock(&buf->lock);
-
- wake_up(&buf->wait);
- return 0;
-}
-
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line)
-{
- return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT);
-}
-
-__printf(3, 0)
-static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
-{
- ssize_t ret;
-
- do {
- va_list args2;
- size_t len;
-
- va_copy(args2, args);
- len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
- va_end(args2);
-
- if (len + 1 <= darray_room(*out)) {
- out->nr += len;
- return len;
- }
-
- ret = darray_make_room_gfp(out, len + 1, gfp);
- } while (ret == 0);
-
- return ret;
-}
-
-ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
- const char *fmt, va_list args)
-{
- struct stdio_buf *buf = &stdio->output;
- unsigned long flags;
- ssize_t ret;
-again:
- if (stdio->done)
- return -EPIPE;
-
- spin_lock_irqsave(&buf->lock, flags);
- ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
- spin_unlock_irqrestore(&buf->lock, flags);
-
- if (ret < 0) {
- if (nonblocking)
- return -EAGAIN;
-
- ret = wait_event_interruptible(buf->wait,
- stdio_redirect_has_output_space(stdio));
- if (ret)
- return ret;
- goto again;
- }
-
- wake_up(&buf->wait);
- return ret;
-}
-
-ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
- const char *fmt, ...)
-{
- va_list args;
- ssize_t ret;
-
- va_start(args, fmt);
- ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
- va_end(args);
-
- return ret;
-}
-
-#endif /* NO_BCACHEFS_FS */
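Once bch2_run_thread_with_file() (or the stdio variant) has handed the anonymous fd back to its caller, the fd behaves like a pipe to the kthread: reads block until the thread produces output or finishes, and closing the fd stops the thread. A minimal userspace-side sketch follows; it assumes "fd" was obtained from whatever interface the caller exposes it through (e.g. an ioctl), and the helper name is illustrative only.

	#include <stdio.h>
	#include <unistd.h>

	/* drain everything the kthread prints, then let it shut down */
	static void drain_thread_output(int fd)
	{
		char buf[4096];
		ssize_t n;

		/* read() returns 0 once the kthread has marked itself done */
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, (size_t) n, stdout);

		close(fd);	/* closing the fd also stops the kthread */
	}
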
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
deleted file mode 100644
index 72497b921911..000000000000
--- a/fs/bcachefs/thread_with_file.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_H
-#define _BCACHEFS_THREAD_WITH_FILE_H
-
-#include "thread_with_file_types.h"
-
-/*
- * Thread with file: Run a kthread and connect it to a file descriptor, so that
- * it can be interacted with via fd read/write methods and closing the file
- * descriptor stops the kthread.
- *
- * We have two different APIs:
- *
- * thread_with_file, the low level version.
- * You get to define the full file_operations, including your release function,
- * which means that you must call bch2_thread_with_file_exit() from your
- * .release method
- *
- * thread_with_stdio, the higher level version
- * This implements full piping of input and output, including .poll.
- *
- * Notes on behaviour:
- * - kthread shutdown behaves like writing or reading from a pipe that has been
- * closed
- * - Input and output buffers are 4096 bytes, although buffers may in some
- * situations slightly exceed that limit so as to avoid chopping off a
- * message in the middle in nonblocking mode.
- * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
- * should be fine but might change in future revisions.
- * - Output buffer may grow past 4096 bytes to deal with messages that are
- * bigger than 4096 bytes
- * - Writing may be done blocking or nonblocking; in nonblocking mode, we only
- * drop entire messages.
- *
- * To write, use bch2_stdio_redirect_printf()
- * To read, use bch2_stdio_redirect_read() or bch2_stdio_redirect_readline()
- */
-
-struct task_struct;
-
-struct thread_with_file {
- struct task_struct *task;
- int ret;
- bool done;
-};
-
-void bch2_thread_with_file_exit(struct thread_with_file *);
-int bch2_run_thread_with_file(struct thread_with_file *,
- const struct file_operations *,
- int (*fn)(void *));
-
-struct thread_with_stdio;
-
-struct thread_with_stdio_ops {
- void (*exit)(struct thread_with_stdio *);
- int (*fn)(struct thread_with_stdio *);
- long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
-};
-
-struct thread_with_stdio {
- struct thread_with_file thr;
- struct stdio_redirect stdio;
- const struct thread_with_stdio_ops *ops;
-};
-
-void bch2_thread_with_stdio_init(struct thread_with_stdio *,
- const struct thread_with_stdio_ops *);
-int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
-int bch2_run_thread_with_stdio(struct thread_with_stdio *,
- const struct thread_with_stdio_ops *);
-int bch2_run_thread_with_stdout(struct thread_with_stdio *,
- const struct thread_with_stdio_ops *);
-int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
-
-int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long);
-int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *);
-
-__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
-__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
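For reference, a hedged sketch of how a consumer of the header above would wire up thread_with_stdio: embed the struct, define the ops, and let bch2_run_thread_with_stdio() hand back a file descriptor. The demo_* names are illustrative and not part of the removed code; on success .exit frees the allocation from the fd's release path, on failure the caller frees it.

	#include <linux/slab.h>

	struct demo_cmd {
		struct thread_with_stdio	thr;
	};

	static void demo_cmd_exit(struct thread_with_stdio *thr)
	{
		kfree(container_of(thr, struct demo_cmd, thr));
	}

	static int demo_cmd_fn(struct thread_with_stdio *thr)
	{
		/* runs in the kthread; output is piped to the fd returned below */
		bch2_stdio_redirect_printf(&thr->stdio, false, "hello from the kthread\n");
		return 0;
	}

	static const struct thread_with_stdio_ops demo_cmd_ops = {
		.exit	= demo_cmd_exit,
		.fn	= demo_cmd_fn,
	};

	/* returns a file descriptor on success, negative error code on failure */
	static int demo_cmd_start(void)
	{
		struct demo_cmd *cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
		int ret;

		if (!cmd)
			return -ENOMEM;

		ret = bch2_run_thread_with_stdio(&cmd->thr, &demo_cmd_ops);
		if (ret < 0)
			kfree(cmd);
		return ret;
	}
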
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
deleted file mode 100644
index f4d484d44f63..000000000000
--- a/fs/bcachefs/thread_with_file_types.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
-
-#include "darray.h"
-
-struct stdio_buf {
- spinlock_t lock;
- wait_queue_head_t wait;
- darray_char buf;
- bool waiting_for_line;
-};
-
-struct stdio_redirect {
- struct stdio_buf input;
- struct stdio_buf output;
- bool done;
-};
-
-#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
deleted file mode 100644
index 2c34fe4be912..000000000000
--- a/fs/bcachefs/time_stats.c
+++ /dev/null
@@ -1,191 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/time.h>
-#include <linux/spinlock.h>
-
-#include "eytzinger.h"
-#include "time_stats.h"
-
-/* disable automatic switching to percpu mode */
-#define TIME_STATS_NONPCPU ((unsigned long) 1)
-
-static const struct time_unit time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "d", (u64) NSEC_PER_SEC * 3600 * 24},
- { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
- { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
- { "eon", U64_MAX },
-};
-
-const struct time_unit *bch2_pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
-static void quantiles_update(struct quantiles *q, u64 v)
-{
- unsigned i = 0;
-
- while (i < ARRAY_SIZE(q->entries)) {
- struct quantile_entry *e = q->entries + i;
-
- if (unlikely(!e->step)) {
- e->m = v;
- e->step = max_t(unsigned, v / 2, 1024);
- } else if (e->m > v) {
- e->m = e->m >= e->step
- ? e->m - e->step
- : 0;
- } else if (e->m < v) {
- e->m = e->m + e->step > e->m
- ? e->m + e->step
- : U32_MAX;
- }
-
- if ((e->m > v ? e->m - v : v - e->m) < e->step)
- e->step = max_t(unsigned, e->step / 2, 1);
-
- if (v >= e->m)
- break;
-
- i = eytzinger0_child(i, v > e->m);
- }
-}
-
-static inline void time_stats_update_one(struct bch2_time_stats *stats,
- u64 start, u64 end)
-{
- u64 duration, freq;
- bool initted = stats->last_event != 0;
-
- if (time_after64(end, start)) {
- struct quantiles *quantiles = time_stats_to_quantiles(stats);
-
- duration = end - start;
- mean_and_variance_update(&stats->duration_stats, duration);
- mean_and_variance_weighted_update(&stats->duration_stats_weighted,
- duration, initted, TIME_STATS_MV_WEIGHT);
- stats->max_duration = max(stats->max_duration, duration);
- stats->min_duration = min(stats->min_duration, duration);
- stats->total_duration += duration;
-
- if (quantiles)
- quantiles_update(quantiles, duration);
- }
-
- if (stats->last_event && time_after64(end, stats->last_event)) {
- freq = end - stats->last_event;
- mean_and_variance_update(&stats->freq_stats, freq);
- mean_and_variance_weighted_update(&stats->freq_stats_weighted,
- freq, initted, TIME_STATS_MV_WEIGHT);
- stats->max_freq = max(stats->max_freq, freq);
- stats->min_freq = min(stats->min_freq, freq);
- }
-
- stats->last_event = end;
-}
-
-void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct time_stat_buffer *b)
-{
- for (struct time_stat_buffer_entry *i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- time_stats_update_one(stats, i->start, i->end);
- b->nr = 0;
-}
-
-static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct time_stat_buffer *b)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&stats->lock, flags);
- __bch2_time_stats_clear_buffer(stats, b);
- spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
- unsigned long flags;
-
- if ((unsigned long) stats->buffer <= TIME_STATS_NONPCPU) {
- spin_lock_irqsave(&stats->lock, flags);
- time_stats_update_one(stats, start, end);
-
- if (!stats->buffer &&
- mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
- stats->duration_stats.n > 1024)
- stats->buffer =
- alloc_percpu_gfp(struct time_stat_buffer,
- GFP_ATOMIC);
- spin_unlock_irqrestore(&stats->lock, flags);
- } else {
- struct time_stat_buffer *b;
-
- preempt_disable();
- b = this_cpu_ptr(stats->buffer);
-
- BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
- b->entries[b->nr++] = (struct time_stat_buffer_entry) {
- .start = start,
- .end = end
- };
-
- if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
- time_stats_clear_buffer(stats, b);
- preempt_enable();
- }
-}
-
-void bch2_time_stats_reset(struct bch2_time_stats *stats)
-{
- spin_lock_irq(&stats->lock);
- unsigned offset = offsetof(struct bch2_time_stats, min_duration);
- memset((void *) stats + offset, 0, sizeof(*stats) - offset);
-
- if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) {
- int cpu;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(stats->buffer, cpu)->nr = 0;
- }
- spin_unlock_irq(&stats->lock);
-}
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
- if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU)
- free_percpu(stats->buffer);
- stats->buffer = NULL;
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
- memset(stats, 0, sizeof(*stats));
- stats->min_duration = U64_MAX;
- stats->min_freq = U64_MAX;
- spin_lock_init(&stats->lock);
-}
-
-void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats)
-{
- bch2_time_stats_init(stats);
- stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU;
-}
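One detail worth noting in the file above: stats->buffer does triple duty as a mode flag. A plain NULL means no percpu buffer has been allocated yet, the small non-NULL sentinel TIME_STATS_NONPCPU means percpu mode is disabled for good (bch2_time_stats_init_no_pcpu()), and anything larger is a real percpu pointer that must be freed. A sketch of the resulting predicates, with demo_* helper names that are illustrative only:

	/* true only when a real percpu buffer has been allocated and must be freed */
	static inline bool demo_has_percpu_buffer(struct bch2_time_stats *stats)
	{
		return (unsigned long) stats->buffer > TIME_STATS_NONPCPU;
	}

	/* only a plain NULL pointer may later be upgraded to a percpu buffer */
	static inline bool demo_may_switch_to_percpu(struct bch2_time_stats *stats)
	{
		return !stats->buffer;
	}
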
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
deleted file mode 100644
index eddb0985bab4..000000000000
--- a/fs/bcachefs/time_stats.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * bch2_time_stats - collect statistics on events that have a duration, with nicely
- * formatted textual output on demand
- *
- * - percpu buffering of event collection: cheap enough to shotgun
- * everywhere without worrying about overhead
- *
- * tracks:
- * - number of events
- * - maximum event duration ever seen
- * - sum of all event durations
- * - average event duration, standard and weighted
- * - standard deviation of event durations, standard and weighted
- * and analogous statistics for the frequency of events
- *
- * We provide both mean and weighted mean (exponentially weighted), and standard
- * deviation and weighted standard deviation, to give an efficient-to-compute
- * view of current behaviour versus average behaviour - "did this event source
- * just become wonky, or is this typical?".
- *
- * Particularly useful for tracking down latency issues.
- */
-#ifndef _BCACHEFS_TIME_STATS_H
-#define _BCACHEFS_TIME_STATS_H
-
-#include <linux/sched/clock.h>
-#include <linux/spinlock_types.h>
-#include <linux/string.h>
-
-#include "mean_and_variance.h"
-
-struct time_unit {
- const char *name;
- u64 nsecs;
-};
-
-/*
- * given a nanosecond value, pick the preferred time units for printing:
- */
-const struct time_unit *bch2_pick_time_units(u64 ns);
-
-/*
- * quantiles - do not use:
- *
- * Only enabled if bch2_time_stats->have_quantiles has been manually set - don't
- * use in new code.
- */
-
-#define NR_QUANTILES 15
-#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
-
-struct quantiles {
- struct quantile_entry {
- u64 m;
- u64 step;
- } entries[NR_QUANTILES];
-};
-
-struct time_stat_buffer {
- unsigned nr;
- struct time_stat_buffer_entry {
- u64 start;
- u64 end;
- } entries[31];
-};
-
-struct bch2_time_stats {
- spinlock_t lock;
- bool have_quantiles;
- struct time_stat_buffer __percpu *buffer;
- /* all fields are in nanoseconds */
- u64 min_duration;
- u64 max_duration;
- u64 total_duration;
- u64 max_freq;
- u64 min_freq;
- u64 last_event;
- u64 last_event_start;
-
- struct mean_and_variance duration_stats;
- struct mean_and_variance freq_stats;
-
-/* default weight for weighted mean and variance calculations */
-#define TIME_STATS_MV_WEIGHT 8
-
- struct mean_and_variance_weighted duration_stats_weighted;
- struct mean_and_variance_weighted freq_stats_weighted;
-};
-
-struct bch2_time_stats_quantiles {
- struct bch2_time_stats stats;
- struct quantiles quantiles;
-};
-
-static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
-{
- return stats->have_quantiles
- ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
- : NULL;
-}
-
-void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-/**
- * bch2_time_stats_update - collect a new event being tracked
- *
- * @stats - bch2_time_stats to update
- * @start - start time of event, recorded with local_clock()
- *
- * The end time of the event will be the current time
- */
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
- __bch2_time_stats_update(stats, start, local_clock());
-}
-
-/**
- * track_event_change - track state change events
- *
- * @stats - bch2_time_stats to update
- * @v - new state, true or false
- *
- * Use this when tracking time stats for state changes, e.g. resource X becoming
- * blocked/unblocked.
- */
-static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
-{
- if (v != !!stats->last_event_start) {
- if (!v) {
- bch2_time_stats_update(stats, stats->last_event_start);
- stats->last_event_start = 0;
- } else {
- stats->last_event_start = local_clock() ?: 1;
- return true;
- }
- }
-
- return false;
-}
-
-void bch2_time_stats_reset(struct bch2_time_stats *);
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
-void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *);
-
-static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
-{
- bch2_time_stats_exit(&statq->stats);
-}
-static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
-{
- bch2_time_stats_init(&statq->stats);
- statq->stats.have_quantiles = true;
- memset(&statq->quantiles, 0, sizeof(statq->quantiles));
-}
-
-#endif /* _BCACHEFS_TIME_STATS_H */
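A short usage sketch of the API declared above, timing one operation and tracking a blocked/unblocked state change. The demo_* names are illustrative and not part of the removed code; local_clock() is the clock the header itself documents for start times.

	static struct bch2_time_stats demo_read_time;
	static struct bch2_time_stats demo_blocked_time;

	static void demo_init(void)
	{
		bch2_time_stats_init(&demo_read_time);
		bch2_time_stats_init(&demo_blocked_time);
	}

	static void demo_timed_read(void)
	{
		u64 start = local_clock();

		/* ... the operation being measured ... */

		bch2_time_stats_update(&demo_read_time, start);
	}

	static void demo_resource_blocked(bool blocked)
	{
		/* accumulates the time spent in the "blocked" state */
		track_event_change(&demo_blocked_time, blocked);
	}

	static void demo_exit(void)
	{
		bch2_time_stats_exit(&demo_blocked_time);
		bch2_time_stats_exit(&demo_read_time);
	}
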
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
deleted file mode 100644
index dfad1d06633d..000000000000
--- a/fs/bcachefs/trace.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "bcachefs.h"
-#include "alloc_types.h"
-#include "buckets.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_update_interior.h"
-#include "keylist.h"
-#include "move_types.h"
-#include "opts.h"
-#include "six.h"
-
-#include <linux/blktrace_api.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
deleted file mode 100644
index 9c5a9c551f03..000000000000
--- a/fs/bcachefs/trace.h
+++ /dev/null
@@ -1,1883 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM bcachefs
-
-#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
-
-#include <linux/tracepoint.h>
-
-#define TRACE_BPOS_entries(name) \
- __field(u64, name##_inode ) \
- __field(u64, name##_offset ) \
- __field(u32, name##_snapshot )
-
-#define TRACE_BPOS_assign(dst, src) \
- __entry->dst##_inode = (src).inode; \
- __entry->dst##_offset = (src).offset; \
- __entry->dst##_snapshot = (src).snapshot
-
-DECLARE_EVENT_CLASS(bpos,
- TP_PROTO(const struct bpos *p),
- TP_ARGS(p),
-
- TP_STRUCT__entry(
- TRACE_BPOS_entries(p)
- ),
-
- TP_fast_assign(
- TRACE_BPOS_assign(p, *p);
- ),
-
- TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
-);
-
-DECLARE_EVENT_CLASS(fs_str,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __string(str, str )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __assign_str(str);
- ),
-
- TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(trans_str,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
- TP_ARGS(trans, caller_ip, str),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __string(str, str )
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __assign_str(str);
- ),
-
- TP_printk("%d,%d %s %pS %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(trans_str_nocaller,
- TP_PROTO(struct btree_trans *trans, const char *str),
- TP_ARGS(trans, str),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- __string(str, str )
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(str);
- ),
-
- TP_printk("%d,%d %s %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->trans_fn, __get_str(str))
-);
-
-DECLARE_EVENT_CLASS(btree_node_nofs,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u8, level )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->level = b->c.level;
- __entry->btree_id = b->c.btree_id;
- TRACE_BPOS_assign(pos, b->key.k.p);
- ),
-
- TP_printk("%d,%d %u %s %llu:%llu:%u",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->level,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(btree_node,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- __field(u8, level )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->level = b->c.level;
- __entry->btree_id = b->c.btree_id;
- TRACE_BPOS_assign(pos, b->key.k.p);
- ),
-
- TP_printk("%d,%d %s %u %s %llu:%llu:%u",
- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
- __entry->level,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(bch_fs,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- ),
-
- TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
-);
-
-DECLARE_EVENT_CLASS(btree_trans,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, trans_fn, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = trans->c->dev;
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- ),
-
- TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
-);
-
-DECLARE_EVENT_CLASS(bio,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(sector_t, sector )
- __field(unsigned int, nr_sector )
- __array(char, rwbs, 6 )
- ),
-
- TP_fast_assign(
- __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0;
- __entry->sector = bio->bi_iter.bi_sector;
- __entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
- ),
-
- TP_printk("%d,%d %s %llu + %u",
- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
- (unsigned long long)__entry->sector, __entry->nr_sector)
-);
-
-/* errors */
-
-TRACE_EVENT(error_throw,
- TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip),
- TP_ARGS(c, bch_err, ip),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(int, err )
- __array(char, err_str, 32 )
- __array(char, ip, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->err = bch_err;
- strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str));
- snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
- ),
-
- TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ip, __entry->err_str)
-);
-
-TRACE_EVENT(error_downcast,
- TP_PROTO(int bch_err, int std_err, unsigned long ip),
- TP_ARGS(bch_err, std_err, ip),
-
- TP_STRUCT__entry(
- __array(char, bch_err, 32 )
- __array(char, std_err, 32 )
- __array(char, ip, 32 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
- strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
- snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
- ),
-
- TP_printk("%s ret %s -> %s %s", __entry->ip,
- __entry->bch_err, __entry->std_err, __entry->ip)
-);
-
-/* disk_accounting.c */
-
-TRACE_EVENT(accounting_mem_insert,
- TP_PROTO(struct bch_fs *c, const char *acc),
- TP_ARGS(c, acc),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(unsigned, new_nr )
- __string(acc, acc )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->new_nr = c->accounting.k.nr;
- __assign_str(acc);
- ),
-
- TP_printk("%d,%d entries %u added %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->new_nr,
- __get_str(acc))
-);
-
-/* fs.c: */
-TRACE_EVENT(bch2_sync_fs,
- TP_PROTO(struct super_block *sb, int wait),
-
- TP_ARGS(sb, wait),
-
- TP_STRUCT__entry(
- __field( dev_t, dev )
- __field( int, wait )
-
- ),
-
- TP_fast_assign(
- __entry->dev = sb->s_dev;
- __entry->wait = wait;
- ),
-
- TP_printk("dev %d,%d wait %d",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->wait)
-);
-
-/* fs-io.c: */
-TRACE_EVENT(bch2_fsync,
- TP_PROTO(struct file *file, int datasync),
-
- TP_ARGS(file, datasync),
-
- TP_STRUCT__entry(
- __field( dev_t, dev )
- __field( ino_t, ino )
- __field( ino_t, parent )
- __field( int, datasync )
- ),
-
- TP_fast_assign(
- struct dentry *dentry = file->f_path.dentry;
-
- __entry->dev = dentry->d_sb->s_dev;
- __entry->ino = d_inode(dentry)->i_ino;
- __entry->parent = d_inode(dentry->d_parent)->i_ino;
- __entry->datasync = datasync;
- ),
-
- TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino,
- (unsigned long) __entry->parent, __entry->datasync)
-);
-
-/* super-io.c: */
-TRACE_EVENT(write_super,
- TP_PROTO(struct bch_fs *c, unsigned long ip),
- TP_ARGS(c, ip),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(unsigned long, ip )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->ip = ip;
- ),
-
- TP_printk("%d,%d for %pS",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- (void *) __entry->ip)
-);
-
-/* io.c: */
-
-DEFINE_EVENT(bio, io_read_promote,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-TRACE_EVENT(io_read_nopromote,
- TP_PROTO(struct bch_fs *c, int ret),
- TP_ARGS(c, ret),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __array(char, ret, 32 )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
- ),
-
- TP_printk("%d,%d ret %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ret)
-);
-
-DEFINE_EVENT(bio, io_read_bounce,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_split,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_retry,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_reuse_race,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-DEFINE_EVENT(bio, io_read_fail_and_poison,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-/* ec.c */
-
-TRACE_EVENT(stripe_create,
- TP_PROTO(struct bch_fs *c, u64 idx, int ret),
- TP_ARGS(c, idx, ret),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, idx )
- __field(int, ret )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->idx = idx;
- __entry->ret = ret;
- ),
-
- TP_printk("%d,%d idx %llu ret %i",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->idx,
- __entry->ret)
-);
-
-/* Journal */
-
-DEFINE_EVENT(bch_fs, journal_full,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(fs_str, journal_entry_full,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, journal_entry_close,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(bio, journal_write,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-TRACE_EVENT(journal_reclaim_start,
- TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
- u64 min_nr, u64 min_key_cache,
- u64 btree_cache_dirty, u64 btree_cache_total,
- u64 btree_key_cache_dirty, u64 btree_key_cache_total),
- TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
- btree_cache_dirty, btree_cache_total,
- btree_key_cache_dirty, btree_key_cache_total),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(bool, direct )
- __field(bool, kicked )
- __field(u64, min_nr )
- __field(u64, min_key_cache )
- __field(u64, btree_cache_dirty )
- __field(u64, btree_cache_total )
- __field(u64, btree_key_cache_dirty )
- __field(u64, btree_key_cache_total )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->direct = direct;
- __entry->kicked = kicked;
- __entry->min_nr = min_nr;
- __entry->min_key_cache = min_key_cache;
- __entry->btree_cache_dirty = btree_cache_dirty;
- __entry->btree_cache_total = btree_cache_total;
- __entry->btree_key_cache_dirty = btree_key_cache_dirty;
- __entry->btree_key_cache_total = btree_key_cache_total;
- ),
-
- TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->direct,
- __entry->kicked,
- __entry->min_nr,
- __entry->min_key_cache,
- __entry->btree_cache_dirty,
- __entry->btree_cache_total,
- __entry->btree_key_cache_dirty,
- __entry->btree_key_cache_total)
-);
-
-TRACE_EVENT(journal_reclaim_finish,
- TP_PROTO(struct bch_fs *c, u64 nr_flushed),
- TP_ARGS(c, nr_flushed),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, nr_flushed )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->nr_flushed = nr_flushed;
- ),
-
- TP_printk("%d,%d flushed %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->nr_flushed)
-);
-
-/* bset.c: */
-
-DEFINE_EVENT(bpos, bkey_pack_pos_fail,
- TP_PROTO(const struct bpos *p),
- TP_ARGS(p)
-);
-
-/* Btree cache: */
-
-TRACE_EVENT(btree_cache_scan,
- TP_PROTO(long nr_to_scan, long can_free, long ret),
- TP_ARGS(nr_to_scan, can_free, ret),
-
- TP_STRUCT__entry(
- __field(long, nr_to_scan )
- __field(long, can_free )
- __field(long, ret )
- ),
-
- TP_fast_assign(
- __entry->nr_to_scan = nr_to_scan;
- __entry->can_free = can_free;
- __entry->ret = ret;
- ),
-
- TP_printk("scanned for %li nodes, can free %li, ret %li",
- __entry->nr_to_scan, __entry->can_free, __entry->ret)
-);
-
-DEFINE_EVENT(btree_node_nofs, btree_cache_reap,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans)
-);
-
-/* Btree */
-
-DEFINE_EVENT(btree_node, btree_node_read,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_node_write,
- TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
- TP_ARGS(b, bytes, sectors),
-
- TP_STRUCT__entry(
- __field(enum btree_node_type, type)
- __field(unsigned, bytes )
- __field(unsigned, sectors )
- ),
-
- TP_fast_assign(
- __entry->type = btree_node_type(b);
- __entry->bytes = bytes;
- __entry->sectors = sectors;
- ),
-
- TP_printk("bkey type %u bytes %u sectors %u",
- __entry->type , __entry->bytes, __entry->sectors)
-);
-
-DEFINE_EVENT(btree_node, btree_node_alloc,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_free,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_reserve_get_fail,
- TP_PROTO(const char *trans_fn,
- unsigned long caller_ip,
- size_t required,
- int ret),
- TP_ARGS(trans_fn, caller_ip, required, ret),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(size_t, required )
- __array(char, ret, 32 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->required = required;
- strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
- ),
-
- TP_printk("%s %pS required %zu ret %s",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->required,
- __entry->ret)
-);
-
-DEFINE_EVENT(btree_node, btree_node_compact,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_merge,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_split,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_rewrite,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-DEFINE_EVENT(btree_node, btree_node_set_root,
- TP_PROTO(struct btree_trans *trans, struct btree *b),
- TP_ARGS(trans, b)
-);
-
-TRACE_EVENT(btree_path_relock_fail,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned level),
- TP_ARGS(trans, caller_ip, path, level),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, level )
- __field(u8, path_idx)
- TRACE_BPOS_entries(pos)
- __array(char, node, 24 )
- __field(u8, self_read_count )
- __field(u8, self_intent_count)
- __field(u8, read_count )
- __field(u8, intent_count )
- __field(u32, iter_lock_seq )
- __field(u32, node_lock_seq )
- ),
-
- TP_fast_assign(
- struct btree *b = btree_path_node(path, level);
- struct six_lock_count c;
-
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- __entry->level = level;
- __entry->path_idx = path - trans->paths;
- TRACE_BPOS_assign(pos, path->pos);
-
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
- __entry->self_read_count = c.n[SIX_LOCK_read];
- __entry->self_intent_count = c.n[SIX_LOCK_intent];
-
- if (IS_ERR(b)) {
- strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
- } else {
- c = six_lock_counts(&path->l[level].b->c.lock);
- __entry->read_count = c.n[SIX_LOCK_read];
- __entry->intent_count = c.n[SIX_LOCK_intent];
- scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
- }
- __entry->iter_lock_seq = path->l[level].lock_seq;
- __entry->node_lock_seq = is_btree_node(path, level)
- ? six_lock_seq(&path->l[level].b->c.lock)
- : 0;
- ),
-
- TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->path_idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level,
- __entry->node,
- __entry->self_read_count,
- __entry->self_intent_count,
- __entry->read_count,
- __entry->intent_count,
- __entry->iter_lock_seq,
- __entry->node_lock_seq)
-);
-
-TRACE_EVENT(btree_path_upgrade_fail,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned level),
- TP_ARGS(trans, caller_ip, path, level),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, level )
- __field(u8, path_idx)
- TRACE_BPOS_entries(pos)
- __field(u8, locked )
- __field(u8, self_read_count )
- __field(u8, self_intent_count)
- __field(u8, read_count )
- __field(u8, intent_count )
- __field(u32, iter_lock_seq )
- __field(u32, node_lock_seq )
- ),
-
- TP_fast_assign(
- struct six_lock_count c;
-
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- __entry->level = level;
- __entry->path_idx = path - trans->paths;
- TRACE_BPOS_assign(pos, path->pos);
- __entry->locked = btree_node_locked(path, level);
-
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
- __entry->self_read_count = c.n[SIX_LOCK_read];
- __entry->self_intent_count = c.n[SIX_LOCK_intent];
- c = six_lock_counts(&path->l[level].b->c.lock);
- __entry->read_count = c.n[SIX_LOCK_read];
- __entry->intent_count = c.n[SIX_LOCK_intent];
- __entry->iter_lock_seq = path->l[level].lock_seq;
- __entry->node_lock_seq = is_btree_node(path, level)
- ? six_lock_seq(&path->l[level].b->c.lock)
- : 0;
- ),
-
- TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->path_idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level,
- __entry->locked,
- __entry->self_read_count,
- __entry->self_intent_count,
- __entry->read_count,
- __entry->intent_count,
- __entry->iter_lock_seq,
- __entry->node_lock_seq)
-);
-
-/* Garbage collection */
-
-DEFINE_EVENT(bch_fs, gc_gens_start,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_gens_end,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-/* Allocator */
-
-DEFINE_EVENT(fs_str, bucket_alloc,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, bucket_alloc_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DECLARE_EVENT_CLASS(discard_buckets_class,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, seen )
- __field(u64, open )
- __field(u64, need_journal_commit )
- __field(u64, discarded )
- __array(char, err, 16 )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->seen = seen;
- __entry->open = open;
- __entry->need_journal_commit = need_journal_commit;
- __entry->discarded = discarded;
- strscpy(__entry->err, err, sizeof(__entry->err));
- ),
-
-	TP_printk("%d,%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->seen,
- __entry->open,
- __entry->need_journal_commit,
- __entry->discarded,
- __entry->err)
-);
-
-DEFINE_EVENT(discard_buckets_class, discard_buckets,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
-);
-
-DEFINE_EVENT(discard_buckets_class, discard_buckets_fast,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
-);
-
-TRACE_EVENT(bucket_invalidate,
- TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
- TP_ARGS(c, dev, bucket, sectors),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u32, dev_idx )
- __field(u32, sectors )
- __field(u64, bucket )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->dev_idx = dev;
- __entry->sectors = sectors;
- __entry->bucket = bucket;
- ),
-
- TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->dev_idx, __entry->bucket,
- __entry->sectors)
-);
-
-/* Moving IO */
-
-DEFINE_EVENT(fs_str, io_move,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_read,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_write,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_finish,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_write_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_start_fail,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-TRACE_EVENT(move_data,
- TP_PROTO(struct bch_fs *c,
- struct bch_move_stats *stats),
- TP_ARGS(c, stats),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, keys_moved )
- __field(u64, keys_raced )
- __field(u64, sectors_seen )
- __field(u64, sectors_moved )
- __field(u64, sectors_raced )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->keys_moved = atomic64_read(&stats->keys_moved);
- __entry->keys_raced = atomic64_read(&stats->keys_raced);
- __entry->sectors_seen = atomic64_read(&stats->sectors_seen);
- __entry->sectors_moved = atomic64_read(&stats->sectors_moved);
- __entry->sectors_raced = atomic64_read(&stats->sectors_raced);
- ),
-
- TP_printk("%d,%d keys moved %llu raced %llu"
-		  " sectors seen %llu moved %llu raced %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->keys_moved,
- __entry->keys_raced,
- __entry->sectors_seen,
- __entry->sectors_moved,
- __entry->sectors_raced)
-);
-
-TRACE_EVENT(copygc,
- TP_PROTO(struct bch_fs *c,
- u64 buckets,
- u64 sectors_seen,
- u64 sectors_moved),
- TP_ARGS(c, buckets, sectors_seen, sectors_moved),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, buckets )
- __field(u64, sectors_seen )
- __field(u64, sectors_moved )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->buckets = buckets;
- __entry->sectors_seen = sectors_seen;
- __entry->sectors_moved = sectors_moved;
- ),
-
- TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->buckets,
- __entry->sectors_seen,
- __entry->sectors_moved)
-);
-
-TRACE_EVENT(copygc_wait,
- TP_PROTO(struct bch_fs *c,
- u64 wait_amount, u64 until),
- TP_ARGS(c, wait_amount, until),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, wait_amount )
- __field(u64, until )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->wait_amount = wait_amount;
- __entry->until = until;
- ),
-
- TP_printk("%d,%u waiting for %llu sectors until %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->wait_amount, __entry->until)
-);
-
-/* btree transactions: */
-
-DECLARE_EVENT_CLASS(transaction_event,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- ),
-
- TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, transaction_commit,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_injected,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(trans_restart_split_race,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree *b),
- TP_ARGS(trans, caller_ip, b),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, level )
- __field(u16, written )
- __field(u16, blocks )
- __field(u16, u64s_remaining )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->level = b->c.level;
- __entry->written = b->written;
- __entry->blocks = btree_blocks(trans->c);
- __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
- ),
-
- TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
- __entry->trans_fn, (void *) __entry->caller_ip,
- __entry->level,
- __entry->written, __entry->blocks,
- __entry->u64s_remaining)
-);
-
-TRACE_EVENT(trans_blocked_journal_reclaim,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
-
- __field(unsigned long, key_cache_nr_keys )
- __field(unsigned long, key_cache_nr_dirty )
- __field(long, must_wait )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys);
- __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty);
- __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c);
- ),
-
- TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li",
- __entry->trans_fn, (void *) __entry->caller_ip,
- __entry->key_cache_nr_keys,
- __entry->key_cache_nr_dirty,
- __entry->must_wait)
-);
-
-#if 0
-/* todo: bring back dynamic fault injection */
-DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-#endif
-
-DEFINE_EVENT(transaction_event, trans_traverse_all,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-DEFINE_EVENT(trans_str, trans_restart_too_many_iters,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- const char *paths),
- TP_ARGS(trans, caller_ip, paths)
-);
-
-DECLARE_EVENT_CLASS(transaction_restart_iter,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos)
- ),
-
- TP_printk("%s %pS btree %s pos %llu:%llu:%u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(fs_str, trans_restart_upgrade,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(trans_str, trans_restart_relock,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
- TP_ARGS(trans, caller_ip, str)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock,
- TP_PROTO(struct btree_trans *trans,
- const char *cycle),
- TP_ARGS(trans, cycle)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(trans_restart_would_deadlock_write,
- TP_PROTO(struct btree_trans *trans),
- TP_ARGS(trans),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- ),
-
- TP_printk("%s", __entry->trans_fn)
-);
-
-TRACE_EVENT(trans_restart_mem_realloced,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- unsigned long bytes),
- TP_ARGS(trans, caller_ip, bytes),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(unsigned long, bytes )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->bytes = bytes;
- ),
-
- TP_printk("%s %pS bytes %lu",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->bytes)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
-TRACE_EVENT(path_downgrade,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned old_locks_want),
- TP_ARGS(trans, caller_ip, path, old_locks_want),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(unsigned, old_locks_want )
- __field(unsigned, new_locks_want )
- __field(unsigned, btree )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->old_locks_want = old_locks_want;
- __entry->new_locks_want = path->locks_want;
- __entry->btree = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->old_locks_want,
- __entry->new_locks_want,
- bch2_btree_id_str(__entry->btree),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-TRACE_EVENT(key_cache_fill,
- TP_PROTO(struct btree_trans *trans, const char *key),
- TP_ARGS(trans, key),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __string(key, key )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(key);
- ),
-
- TP_printk("%s %s", __entry->trans_fn, __get_str(key))
-);
-
-TRACE_EVENT(write_buffer_flush,
- TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
- TP_ARGS(trans, nr, skipped, fast, size),
-
- TP_STRUCT__entry(
- __field(size_t, nr )
- __field(size_t, skipped )
- __field(size_t, fast )
- __field(size_t, size )
- ),
-
- TP_fast_assign(
- __entry->nr = nr;
- __entry->skipped = skipped;
- __entry->fast = fast;
- __entry->size = size;
- ),
-
- TP_printk("%zu/%zu skipped %zu fast %zu",
- __entry->nr, __entry->size, __entry->skipped, __entry->fast)
-);
-
-TRACE_EVENT(write_buffer_flush_sync,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
- TP_ARGS(trans, caller_ip),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- ),
-
- TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
-);
-
-TRACE_EVENT(write_buffer_flush_slowpath,
- TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
- TP_ARGS(trans, slowpath, total),
-
- TP_STRUCT__entry(
- __field(size_t, slowpath )
- __field(size_t, total )
- ),
-
- TP_fast_assign(
- __entry->slowpath = slowpath;
- __entry->total = total;
- ),
-
- TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
-);
-
-TRACE_EVENT(write_buffer_maybe_flush,
- TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key),
- TP_ARGS(trans, caller_ip, key),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __string(key, key )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __assign_str(key);
- ),
-
- TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key))
-);
-
-DEFINE_EVENT(fs_str, rebalance_extent,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, data_update,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_pred,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_created_rebalance,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, extent_trim_atomic,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_slot,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, __btree_iter_peek,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_max,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-DEFINE_EVENT(fs_str, btree_iter_peek_prev_min,
- TP_PROTO(struct bch_fs *c, const char *str),
- TP_ARGS(c, str)
-);
-
-#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
-
-TRACE_EVENT(update_by_path,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path,
- struct btree_insert_entry *i, bool overwrite),
- TP_ARGS(trans, path, i, overwrite),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(btree_path_idx_t, path_idx )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- __field(u8, overwrite )
- __field(btree_path_idx_t, update_idx )
- __field(btree_path_idx_t, nr_updates )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->path_idx = path - trans->paths;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- __entry->overwrite = overwrite;
- __entry->update_idx = i - trans->updates;
- __entry->nr_updates = trans->nr_updates;
- ),
-
- TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
- __entry->trans_fn,
- __entry->path_idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->overwrite,
- __entry->update_idx,
- __entry->nr_updates)
-);
-
-TRACE_EVENT(btree_path_lock,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_bkey_cached_common *b),
- TP_ARGS(trans, caller_ip, b),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, level )
- __array(char, node, 24 )
- __field(u32, lock_seq )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = b->btree_id;
- __entry->level = b->level;
-
- scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
- __entry->lock_seq = six_lock_seq(&b->lock);
- ),
-
- TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->level,
- __entry->node,
- __entry->lock_seq)
-);
-
-DECLARE_EVENT_CLASS(btree_path_ev,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path),
-
- TP_STRUCT__entry(
- __field(u16, idx )
- __field(u8, ref )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
- __entry->idx, __entry->ref,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-TRACE_EVENT(btree_path_alloc,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, locks_want )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->locks_want = path->locks_want;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
- __entry->idx,
- bch2_btree_id_str(__entry->btree_id),
- __entry->locks_want,
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot)
-);
-
-TRACE_EVENT(btree_path_get,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
- TP_ARGS(trans, path, new_pos),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, ref )
- __field(u8, preserve )
- __field(u8, locks_want )
- __field(u8, btree_id )
- TRACE_BPOS_entries(old_pos)
- TRACE_BPOS_entries(new_pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- __entry->locks_want = path->locks_want;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(old_pos, path->pos);
- TRACE_BPOS_assign(new_pos, *new_pos);
- ),
-
- TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u",
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->locks_want,
- __entry->old_pos_inode,
- __entry->old_pos_offset,
- __entry->old_pos_snapshot,
- __entry->new_pos_inode,
- __entry->new_pos_offset,
- __entry->new_pos_snapshot)
-);
-
-DECLARE_EVENT_CLASS(btree_path_clone,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
- TP_ARGS(trans, path, new),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, new_idx )
- __field(u8, btree_id )
- __field(u8, ref )
- __field(u8, preserve )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->new_idx = new - trans->paths;
- __entry->btree_id = path->btree_id;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- TRACE_BPOS_assign(pos, path->pos);
- ),
-
- TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u",
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->new_idx)
-);
-
-DEFINE_EVENT(btree_path_clone, btree_path_clone,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
- TP_ARGS(trans, path, new)
-);
-
-DEFINE_EVENT(btree_path_clone, btree_path_save_pos,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
- TP_ARGS(trans, path, new)
-);
-
-DECLARE_EVENT_CLASS(btree_path_traverse,
- TP_PROTO(struct btree_trans *trans,
- struct btree_path *path),
- TP_ARGS(trans, path),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(btree_path_idx_t, idx )
- __field(u8, ref )
- __field(u8, preserve )
- __field(u8, should_be_locked )
- __field(u8, btree_id )
- __field(u8, level )
- TRACE_BPOS_entries(pos)
- __field(u8, locks_want )
- __field(u8, nodes_locked )
- __array(char, node0, 24 )
- __array(char, node1, 24 )
- __array(char, node2, 24 )
- __array(char, node3, 24 )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- __entry->btree_id = path->btree_id;
- __entry->level = path->level;
- TRACE_BPOS_assign(pos, path->pos);
-
- __entry->locks_want = path->locks_want;
- __entry->nodes_locked = path->nodes_locked;
- struct btree *b = path->l[0].b;
- if (IS_ERR(b))
- strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
- else
- scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
-		b = path->l[1].b;
-		if (IS_ERR(b))
-			strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
-		else
-			scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
-		b = path->l[2].b;
-		if (IS_ERR(b))
-			strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
-		else
-			scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
-		b = path->l[3].b;
-		if (IS_ERR(b))
-			strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
-		else
-			scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
- ),
-
- TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n"
- "locks %u %u %u %u node %s %s %s %s",
- __entry->trans_fn,
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level,
- __entry->locks_want,
- (__entry->nodes_locked >> 6) & 3,
- (__entry->nodes_locked >> 4) & 3,
- (__entry->nodes_locked >> 2) & 3,
- (__entry->nodes_locked >> 0) & 3,
- __entry->node3,
- __entry->node2,
- __entry->node1,
- __entry->node0)
-);
-
-DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start,
- TP_PROTO(struct btree_trans *trans,
- struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
- TP_PROTO(struct btree_trans *trans, struct btree_path *path),
- TP_ARGS(trans, path)
-);
-
-TRACE_EVENT(btree_path_set_pos,
- TP_PROTO(struct btree_trans *trans,
- struct btree_path *path,
- struct bpos *new_pos),
- TP_ARGS(trans, path, new_pos),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, ref )
- __field(u8, preserve )
- __field(u8, btree_id )
- TRACE_BPOS_entries(old_pos)
- TRACE_BPOS_entries(new_pos)
- __field(u8, locks_want )
- __field(u8, nodes_locked )
- __array(char, node0, 24 )
- __array(char, node1, 24 )
- __array(char, node2, 24 )
- __array(char, node3, 24 )
- ),
-
- TP_fast_assign(
- __entry->idx = path - trans->paths;
- __entry->ref = path->ref;
- __entry->preserve = path->preserve;
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(old_pos, path->pos);
- TRACE_BPOS_assign(new_pos, *new_pos);
-
- __entry->nodes_locked = path->nodes_locked;
- struct btree *b = path->l[0].b;
- if (IS_ERR(b))
- strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
- else
- scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
-		b = path->l[1].b;
-		if (IS_ERR(b))
-			strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node1));
-		else
-			scnprintf(__entry->node1, sizeof(__entry->node1), "%px", &b->c);
-		b = path->l[2].b;
-		if (IS_ERR(b))
-			strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node2));
-		else
-			scnprintf(__entry->node2, sizeof(__entry->node2), "%px", &b->c);
-		b = path->l[3].b;
-		if (IS_ERR(b))
-			strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node3));
-		else
-			scnprintf(__entry->node3, sizeof(__entry->node3), "%px", &b->c);
- ),
-
- TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
- "locks %u %u %u %u node %s %s %s %s",
- __entry->idx,
- __entry->ref,
- __entry->preserve,
- bch2_btree_id_str(__entry->btree_id),
- __entry->old_pos_inode,
- __entry->old_pos_offset,
- __entry->old_pos_snapshot,
- __entry->new_pos_inode,
- __entry->new_pos_offset,
- __entry->new_pos_snapshot,
- (__entry->nodes_locked >> 6) & 3,
- (__entry->nodes_locked >> 4) & 3,
- (__entry->nodes_locked >> 2) & 3,
- (__entry->nodes_locked >> 0) & 3,
- __entry->node3,
- __entry->node2,
- __entry->node1,
- __entry->node0)
-);
-
-TRACE_EVENT(btree_path_free,
- TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup),
- TP_ARGS(trans, path, dup),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- __field(u8, preserve )
- __field(u8, should_be_locked)
- __field(s8, dup )
- __field(u8, dup_locked )
- ),
-
- TP_fast_assign(
- __entry->idx = path;
- __entry->preserve = trans->paths[path].preserve;
- __entry->should_be_locked = trans->paths[path].should_be_locked;
- __entry->dup = dup ? dup - trans->paths : -1;
- __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0;
- ),
-
- TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx,
- __entry->preserve ? 'P' : ' ',
- __entry->should_be_locked ? 'S' : ' ',
- __entry->dup,
- __entry->dup_locked)
-);
-
-#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
-#ifndef _TRACE_BCACHEFS_H
-
-static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path,
- struct btree_insert_entry *i, bool overwrite) {}
-static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {}
-static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
-static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
-static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
-static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
-static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
-static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
-
-#endif
-#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
-
-#define _TRACE_BCACHEFS_H
-#endif /* _TRACE_BCACHEFS_H */
-
-/* This part must be outside protection */
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../fs/bcachefs
-
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-#include <trace/define_trace.h>
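The btree_path tracepoints above print per-level lock state by unpacking path->nodes_locked two bits per level, highest level first (the `(nodes_locked >> 6) & 3` chain in TP_printk). A minimal, hypothetical decoder for that packing, for readers inspecting raw trace output; nothing here is kernel code:

/* Illustration only: decode the 2-bits-per-level nodes_locked packing used by
 * the tracepoints above, in the same level 3..0 order as their TP_printk. */
static void print_nodes_locked_example(unsigned nodes_locked)
{
	for (int level = 3; level >= 0; level--)
		pr_info("level %d: %u\n", level, (nodes_locked >> (2 * level)) & 3);
}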
diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c
deleted file mode 100644
index 9764c2e6a910..000000000000
--- a/fs/bcachefs/two_state_shared_lock.c
+++ /dev/null
@@ -1,8 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "two_state_shared_lock.h"
-
-void __bch2_two_state_lock(two_state_lock_t *lock, int s)
-{
- __wait_event(lock->wait, bch2_two_state_trylock(lock, s));
-}
diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h
deleted file mode 100644
index 7f647846b511..000000000000
--- a/fs/bcachefs/two_state_shared_lock.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_TWO_STATE_LOCK_H
-#define _BCACHEFS_TWO_STATE_LOCK_H
-
-#include <linux/atomic.h>
-#include <linux/sched.h>
-#include <linux/wait.h>
-
-#include "util.h"
-
-/*
- * Two-state lock - can be taken for add or for block. Either state is shared,
- * like the read side of an rwsem, but the two states conflict with each other:
- */
-typedef struct {
- atomic_long_t v;
- wait_queue_head_t wait;
-} two_state_lock_t;
-
-static inline void two_state_lock_init(two_state_lock_t *lock)
-{
- atomic_long_set(&lock->v, 0);
- init_waitqueue_head(&lock->wait);
-}
-
-static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
-{
- long i = s ? 1 : -1;
-
- EBUG_ON(atomic_long_read(&lock->v) == 0);
-
- if (atomic_long_sub_return_release(i, &lock->v) == 0)
- wake_up_all(&lock->wait);
-}
-
-static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
-{
- long i = s ? 1 : -1;
- long old;
-
- old = atomic_long_read(&lock->v);
- do {
- if (i > 0 ? old < 0 : old > 0)
- return false;
- } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i));
-
- return true;
-}
-
-void __bch2_two_state_lock(two_state_lock_t *, int);
-
-static inline void bch2_two_state_lock(two_state_lock_t *lock, int s)
-{
- if (!bch2_two_state_trylock(lock, s))
- __bch2_two_state_lock(lock, s);
-}
-
-#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
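A rough usage sketch of the two-state lock API above; the calling context is hypothetical, only the two_state_lock_t functions come from the header. Holders of the same state share the lock, holders of opposite states exclude each other:

static two_state_lock_t example_lock;

static void two_state_lock_example(void)
{
	two_state_lock_init(&example_lock);

	/* state 0: shared with every other state-0 holder */
	bch2_two_state_lock(&example_lock, 0);
	/* ... work that must exclude state-1 holders ... */
	bch2_two_state_unlock(&example_lock, 0);

	/* opportunistically take the opposite state without sleeping */
	if (bch2_two_state_trylock(&example_lock, 1))
		bch2_two_state_unlock(&example_lock, 1);
}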
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
deleted file mode 100644
index df9a6071fe18..000000000000
--- a/fs/bcachefs/util.c
+++ /dev/null
@@ -1,1047 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * random utility code, for bcache but in theory not specific to bcache
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/console.h>
-#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/log2.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/sched/clock.h>
-
-#include "eytzinger.h"
-#include "mean_and_variance.h"
-#include "util.h"
-
-static const char si_units[] = "?kMGTPEZY";
-
-/* string_get_size units: */
-static const char *const units_2[] = {
- "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
-};
-static const char *const units_10[] = {
- "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
-};
-
-static int parse_u64(const char *cp, u64 *res)
-{
- const char *start = cp;
- u64 v = 0;
-
- if (!isdigit(*cp))
- return -EINVAL;
-
- do {
- if (v > U64_MAX / 10)
- return -ERANGE;
- v *= 10;
- if (v > U64_MAX - (*cp - '0'))
- return -ERANGE;
- v += *cp - '0';
- cp++;
- } while (isdigit(*cp));
-
- *res = v;
- return cp - start;
-}
-
-static int bch2_pow(u64 n, u64 p, u64 *res)
-{
- *res = 1;
-
- while (p--) {
- if (*res > div64_u64(U64_MAX, n))
- return -ERANGE;
- *res *= n;
- }
- return 0;
-}
-
-static int parse_unit_suffix(const char *cp, u64 *res)
-{
- const char *start = cp;
- u64 base = 1024;
- unsigned u;
- int ret;
-
- if (*cp == ' ')
- cp++;
-
- for (u = 1; u < strlen(si_units); u++)
- if (*cp == si_units[u]) {
- cp++;
- goto got_unit;
- }
-
- for (u = 0; u < ARRAY_SIZE(units_2); u++)
- if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
- cp += strlen(units_2[u]);
- goto got_unit;
- }
-
- for (u = 0; u < ARRAY_SIZE(units_10); u++)
- if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
- cp += strlen(units_10[u]);
- base = 1000;
- goto got_unit;
- }
-
- *res = 1;
- return 0;
-got_unit:
- ret = bch2_pow(base, u, res);
- if (ret)
- return ret;
-
- return cp - start;
-}
-
-#define parse_or_ret(cp, _f) \
-do { \
- int _ret = _f; \
- if (_ret < 0) \
- return _ret; \
- cp += _ret; \
-} while (0)
-
-static int __bch2_strtou64_h(const char *cp, u64 *res)
-{
- const char *start = cp;
- u64 v = 0, b, f_n = 0, f_d = 1;
- int ret;
-
- parse_or_ret(cp, parse_u64(cp, &v));
-
- if (*cp == '.') {
- cp++;
- ret = parse_u64(cp, &f_n);
- if (ret < 0)
- return ret;
- cp += ret;
-
- ret = bch2_pow(10, ret, &f_d);
- if (ret)
- return ret;
- }
-
- parse_or_ret(cp, parse_unit_suffix(cp, &b));
-
- if (v > div64_u64(U64_MAX, b))
- return -ERANGE;
- v *= b;
-
- if (f_n > div64_u64(U64_MAX, b))
- return -ERANGE;
-
- f_n = div64_u64(f_n * b, f_d);
- if (v + f_n < v)
- return -ERANGE;
- v += f_n;
-
- *res = v;
- return cp - start;
-}
-
-static int __bch2_strtoh(const char *cp, u64 *res,
- u64 t_max, bool t_signed)
-{
- bool positive = *cp != '-';
- u64 v = 0;
-
- if (*cp == '+' || *cp == '-')
- cp++;
-
- parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
-
- if (*cp == '\n')
- cp++;
- if (*cp)
- return -EINVAL;
-
- if (positive) {
- if (v > t_max)
- return -ERANGE;
- } else {
- if (v && !t_signed)
- return -ERANGE;
-
- if (v > t_max + 1)
- return -ERANGE;
- v = -v;
- }
-
- *res = v;
- return 0;
-}
-
-#define STRTO_H(name, type) \
-int bch2_ ## name ## _h(const char *cp, type *res) \
-{ \
- u64 v = 0; \
- int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \
- ANYSINT_MAX(type) != ((type) ~0ULL)); \
- *res = v; \
- return ret; \
-}
-
-STRTO_H(strtoint, int)
-STRTO_H(strtouint, unsigned int)
-STRTO_H(strtoll, long long)
-STRTO_H(strtoull, unsigned long long)
-STRTO_H(strtou64, u64)
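A worked example of the human-readable parsing above (the input string is made up): "1.5G" is parsed as integer part 1, fractional part .5, and an SI 'G' suffix taken as base 2 (1024^3), giving 1073741824 + 536870912 = 1610612736:

static void strtoh_example(void)
{
	u64 v;

	/* "1.5G" -> 1.5 * 1024^3 == 1610612736 (suffixes default to base 2) */
	if (!bch2_strtou64_h("1.5G", &v))
		pr_info("parsed %llu\n", v);
}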
-
-u64 bch2_read_flag_list(const char *opt, const char * const list[])
-{
- u64 ret = 0;
- char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
-
- if (!d)
- return -ENOMEM;
-
- s = strim(d);
-
- while ((p = strsep(&s, ",;"))) {
- int flag = match_string(list, -1, p);
-
- if (flag < 0) {
- ret = -1;
- break;
- }
-
- ret |= BIT_ULL(flag);
- }
-
- kfree(d);
-
- return ret;
-}
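For illustration, with a hypothetical NULL-terminated flag table, bch2_read_flag_list() turns a comma- or semicolon-separated string into a bitmask of list indices:

/* example_flags is hypothetical; only bch2_read_flag_list() is real */
static const char * const example_flags[] = { "alpha", "beta", "gamma", NULL };

static void read_flag_list_example(void)
{
	u64 mask = bch2_read_flag_list("beta,gamma", example_flags);

	/* "beta" is index 1, "gamma" is index 2: mask == BIT_ULL(1)|BIT_ULL(2) == 6 */
	pr_info("mask %llu\n", mask);
}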
-
-bool bch2_is_zero(const void *_p, size_t n)
-{
- const char *p = _p;
- size_t i;
-
- for (i = 0; i < n; i++)
- if (p[i])
- return false;
- return true;
-}
-
-void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
-{
- while (nr_bits)
- prt_char(out, '0' + ((v >> --nr_bits) & 1));
-}
-
-void bch2_prt_u64_base2(struct printbuf *out, u64 v)
-{
- bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
-}
-
-static bool string_is_spaces(const char *str)
-{
- while (*str) {
- if (*str != ' ')
- return false;
- str++;
- }
- return true;
-}
-
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
-{
- bool locked = false;
- const char *p;
-
- if (!lines) {
- printk("%s (null)\n", prefix);
- return;
- }
-
- locked = console_trylock();
-
- while (*lines) {
- p = strchrnul(lines, '\n');
- if (!*p && string_is_spaces(lines))
- break;
-
- printk("%s%.*s\n", prefix, (int) (p - lines), lines);
- if (!*p)
- break;
- lines = p + 1;
- }
- if (locked)
- console_unlock();
-}
-
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
- gfp_t gfp)
-{
-#ifdef CONFIG_STACKTRACE
- unsigned nr_entries = 0;
-
- stack->nr = 0;
- int ret = darray_make_room_gfp(stack, 32, gfp);
- if (ret)
- return ret;
-
- if (!down_read_trylock(&task->signal->exec_update_lock))
- return -1;
-
- do {
- nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
- } while (nr_entries == stack->size &&
- !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
-
- stack->nr = nr_entries;
- up_read(&task->signal->exec_update_lock);
-
- return ret;
-#else
- return 0;
-#endif
-}
-
-void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
-{
- darray_for_each(*stack, i) {
- prt_printf(out, "[<0>] %pB", (void *) *i);
- prt_newline(out);
- }
-}
-
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
-{
- bch_stacktrace stack = { 0 };
- int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
-
- bch2_prt_backtrace(out, &stack);
- darray_exit(&stack);
- return ret;
-}
-
-#ifndef __KERNEL__
-#include <time.h>
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- time_t t = sec;
- char buf[64];
- ctime_r(&t, buf);
- strim(buf);
- prt_str(out, buf);
-}
-#else
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- char buf[64];
- snprintf(buf, sizeof(buf), "%ptT", &sec);
-	prt_str(out, buf);
-}
-#endif
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = bch2_pick_time_units(ns);
-
- prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
-}
-
-static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = bch2_pick_time_units(ns);
-
- prt_printf(out, "%llu \r%s", div64_u64(ns, u->nsecs), u->name);
-}
-
-static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
-{
- prt_printf(out, "%s\t", name);
- bch2_pr_time_units_aligned(out, ns);
- prt_newline(out);
-}
-
-#define TABSTOP_SIZE 12
-
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
-{
- struct quantiles *quantiles = time_stats_to_quantiles(stats);
- s64 f_mean = 0, d_mean = 0;
- u64 f_stddev = 0, d_stddev = 0;
-
- if (stats->buffer) {
- int cpu;
-
- spin_lock_irq(&stats->lock);
- for_each_possible_cpu(cpu)
- __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
- spin_unlock_irq(&stats->lock);
- }
-
- /*
- * avoid divide by zero
- */
- if (stats->freq_stats.n) {
- f_mean = mean_and_variance_get_mean(stats->freq_stats);
- f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
- d_mean = mean_and_variance_get_mean(stats->duration_stats);
- d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
- }
-
- printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
- prt_printf(out, "count:\t%llu\n", stats->duration_stats.n);
- printbuf_tabstop_pop(out);
-
- printbuf_tabstops_reset(out);
-
- printbuf_tabstop_push(out, out->indent + 20);
- printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
- printbuf_tabstop_push(out, 0);
- printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
-
- prt_printf(out, "\tsince mount\r\trecent\r\n");
-
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, out->indent + 20);
- printbuf_tabstop_push(out, TABSTOP_SIZE);
- printbuf_tabstop_push(out, 2);
- printbuf_tabstop_push(out, TABSTOP_SIZE);
-
- prt_printf(out, "duration of events\n");
- printbuf_indent_add(out, 2);
-
- pr_name_and_units(out, "min:", stats->min_duration);
- pr_name_and_units(out, "max:", stats->max_duration);
- pr_name_and_units(out, "total:", stats->total_duration);
-
- prt_printf(out, "mean:\t");
- bch2_pr_time_units_aligned(out, d_mean);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
- prt_newline(out);
-
- prt_printf(out, "stddev:\t");
- bch2_pr_time_units_aligned(out, d_stddev);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
-
- printbuf_indent_sub(out, 2);
- prt_newline(out);
-
- prt_printf(out, "time between events\n");
- printbuf_indent_add(out, 2);
-
- pr_name_and_units(out, "min:", stats->min_freq);
- pr_name_and_units(out, "max:", stats->max_freq);
-
- prt_printf(out, "mean:\t");
- bch2_pr_time_units_aligned(out, f_mean);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
- prt_newline(out);
-
- prt_printf(out, "stddev:\t");
- bch2_pr_time_units_aligned(out, f_stddev);
- prt_tab(out);
- bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
-
- printbuf_indent_sub(out, 2);
- prt_newline(out);
-
- printbuf_tabstops_reset(out);
-
- if (quantiles) {
- int i = eytzinger0_first(NR_QUANTILES);
- const struct time_unit *u =
- bch2_pick_time_units(quantiles->entries[i].m);
- u64 last_q = 0;
-
- prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(j, NR_QUANTILES) {
- bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1;
-
- u64 q = max(quantiles->entries[j].m, last_q);
- prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
- if (is_last)
- prt_newline(out);
- last_q = q;
- }
- }
-}
-
-/* ratelimit: */
-
-/**
- * bch2_ratelimit_delay() - return how long to delay until the next time to do
- * some work
- * @d: the struct bch_ratelimit to update
- * Returns: the amount of time to delay by, in jiffies
- */
-u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
-{
- u64 now = local_clock();
-
- return time_after64(d->next, now)
- ? nsecs_to_jiffies(d->next - now)
- : 0;
-}
-
-/**
- * bch2_ratelimit_increment() - increment @d by the amount of work done
- * @d: the struct bch_ratelimit to update
- * @done: the amount of work done, in arbitrary units
- */
-void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
-{
- u64 now = local_clock();
-
- d->next += div_u64(done * NSEC_PER_SEC, d->rate);
-
- if (time_before64(now + NSEC_PER_SEC, d->next))
- d->next = now + NSEC_PER_SEC;
-
- if (time_after64(now - NSEC_PER_SEC * 2, d->next))
- d->next = now - NSEC_PER_SEC * 2;
-}
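A sketch of how a throttled worker would drive the two functions above; the kthread and the 4096-unit work item are hypothetical, but the delay/increment pattern is the intended use:

static int ratelimited_worker_example(void *data)
{
	struct bch_ratelimit *rl = data;	/* hypothetical: set up by the caller */

	while (!kthread_should_stop()) {
		u64 delay = bch2_ratelimit_delay(rl);

		if (delay)
			schedule_timeout_interruptible(delay);

		/* ... do roughly 4096 units of work ... */
		bch2_ratelimit_increment(rl, 4096);
	}
	return 0;
}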
-
-/* pd controller: */
-
-/*
- * Updates pd_controller. Attempts to scale input values to units per second.
- * @target: desired value
- * @actual: current value
- *
- * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
- * it makes actual go down.
- */
-void bch2_pd_controller_update(struct bch_pd_controller *pd,
- s64 target, s64 actual, int sign)
-{
- s64 proportional, derivative, change;
-
- unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
-
- if (seconds_since_update == 0)
- return;
-
- pd->last_update = jiffies;
-
- proportional = actual - target;
- proportional *= seconds_since_update;
- proportional = div_s64(proportional, pd->p_term_inverse);
-
- derivative = actual - pd->last_actual;
- derivative = div_s64(derivative, seconds_since_update);
- derivative = ewma_add(pd->smoothed_derivative, derivative,
- (pd->d_term / seconds_since_update) ?: 1);
- derivative = derivative * pd->d_term;
- derivative = div_s64(derivative, pd->p_term_inverse);
-
- change = proportional + derivative;
-
- /* Don't increase rate if not keeping up */
- if (change > 0 &&
- pd->backpressure &&
- time_after64(local_clock(),
- pd->rate.next + NSEC_PER_MSEC))
- change = 0;
-
- change *= (sign * -1);
-
- pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
- 1, UINT_MAX);
-
- pd->last_actual = actual;
- pd->last_derivative = derivative;
- pd->last_proportional = proportional;
- pd->last_change = change;
- pd->last_target = target;
-}
-
-void bch2_pd_controller_init(struct bch_pd_controller *pd)
-{
- pd->rate.rate = 1024;
- pd->last_update = jiffies;
- pd->p_term_inverse = 6000;
- pd->d_term = 30;
- pd->d_smooth = pd->d_term;
- pd->backpressure = 1;
-}
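A hedged sketch of driving the controller: the dirty-data numbers are invented, but the periodic update call and the sign convention follow the comment above (sign = -1 because raising the rate makes the actual value fall):

static void pd_controller_example(struct bch_pd_controller *pd,
				  s64 dirty_target, s64 dirty_actual)
{
	/* hypothetical caller, e.g. a writeback thread running once per second */
	bch2_pd_controller_update(pd, dirty_target, dirty_actual, -1);

	/* pd->rate is then consumed via bch2_ratelimit_delay()/_increment() */
}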
-
-void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
-{
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 20);
-
- prt_printf(out, "rate:\t");
- prt_human_readable_s64(out, pd->rate.rate);
- prt_newline(out);
-
- prt_printf(out, "target:\t");
- prt_human_readable_u64(out, pd->last_target);
- prt_newline(out);
-
- prt_printf(out, "actual:\t");
- prt_human_readable_u64(out, pd->last_actual);
- prt_newline(out);
-
- prt_printf(out, "proportional:\t");
- prt_human_readable_s64(out, pd->last_proportional);
- prt_newline(out);
-
- prt_printf(out, "derivative:\t");
- prt_human_readable_s64(out, pd->last_derivative);
- prt_newline(out);
-
- prt_printf(out, "change:\t");
- prt_human_readable_s64(out, pd->last_change);
- prt_newline(out);
-
- prt_printf(out, "next io:\t%llims\n", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
-}
-
-/* misc: */
-
-void bch2_bio_map(struct bio *bio, void *base, size_t size)
-{
- while (size) {
- struct page *page = is_vmalloc_addr(base)
- ? vmalloc_to_page(base)
- : virt_to_page(base);
- unsigned offset = offset_in_page(base);
- unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
-
- BUG_ON(!bio_add_page(bio, page, len, offset));
- size -= len;
- base += len;
- }
-}
-
-int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
-{
- while (size) {
- struct page *page = alloc_pages(gfp_mask, 0);
- unsigned len = min_t(size_t, PAGE_SIZE, size);
-
- if (!page)
- return -ENOMEM;
-
- if (unlikely(!bio_add_page(bio, page, len, 0))) {
- __free_page(page);
- break;
- }
-
- size -= len;
- }
-
- return 0;
-}
-
-u64 bch2_get_random_u64_below(u64 ceil)
-{
- if (ceil <= U32_MAX)
- return __get_random_u32_below(ceil);
-
- /* this is the same (clever) algorithm as in __get_random_u32_below() */
- u64 rand = get_random_u64();
- u64 mult = ceil * rand;
-
- if (unlikely(mult < ceil)) {
- u64 bound;
- div64_u64_rem(-ceil, ceil, &bound);
- while (unlikely(mult < bound)) {
- rand = get_random_u64();
- mult = ceil * rand;
- }
- }
-
- return mul_u64_u64_shr(ceil, rand, 64);
-}
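The function above is the 64-bit form of the multiply-shift bounded-random technique: take the high half of ceil * rand, and reject only the few low-half values that would bias the result. A userspace 32-bit illustration of the same idea (the entropy source is left to the caller; none of this code is in the kernel):

#include <stdint.h>

static uint32_t u32_below_example(uint32_t ceil, uint32_t (*get_random)(void))
{
	uint32_t rand = get_random();
	uint64_t mult = (uint64_t) ceil * rand;

	if ((uint32_t) mult < ceil) {			/* low half small: result may be biased */
		uint32_t bound = (0u - ceil) % ceil;	/* (2^32 - ceil) % ceil */

		while ((uint32_t) mult < bound) {	/* reject and redraw */
			rand = get_random();
			mult = (uint64_t) ceil * rand;
		}
	}
	return mult >> 32;				/* high half: uniform in [0, ceil) */
}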
-
-void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
-
- __bio_for_each_segment(bv, dst, iter, dst_iter) {
- void *dstp = kmap_local_page(bv.bv_page);
-
- memcpy(dstp + bv.bv_offset, src, bv.bv_len);
- kunmap_local(dstp);
-
- src += bv.bv_len;
- }
-}
-
-void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
-
- __bio_for_each_segment(bv, src, iter, src_iter) {
- void *srcp = kmap_local_page(bv.bv_page);
-
- memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
- kunmap_local(srcp);
-
- dst += bv.bv_len;
- }
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_corrupt_bio(struct bio *bio)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
-
- bio_for_each_segment(bv, bio, iter) {
- unsigned u64s = bv.bv_len / sizeof(u64);
-
- if (offset < u64s) {
- u64 *segment = bvec_kmap_local(&bv);
- segment[offset] = get_random_u64();
- kunmap_local(segment);
- return;
- }
- offset -= u64s;
- }
-}
-#endif
-
-void bch2_bio_to_text(struct printbuf *out, struct bio *bio)
-{
- prt_printf(out, "bi_remaining:\t%u\n",
- atomic_read(&bio->__bi_remaining));
- prt_printf(out, "bi_end_io:\t%ps\n",
- bio->bi_end_io);
- prt_printf(out, "bi_status:\t%u\n",
- bio->bi_status);
-}
-
-#if 0
-void eytzinger1_test(void)
-{
- unsigned inorder, size;
-
- pr_info("1 based eytzinger test:\n");
-
- for (size = 2;
- size < 65536;
- size++) {
- unsigned extra = eytzinger1_extra(size);
-
- if (!(size % 4096))
- pr_info("tree size %u\n", size);
-
- inorder = 1;
- eytzinger1_for_each(eytz, size) {
- BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
- BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
- BUG_ON(eytz != eytzinger1_last(size) &&
- eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
-
- inorder++;
- }
- BUG_ON(inorder - 1 != size);
- }
-}
-
-void eytzinger0_test(void)
-{
-
- unsigned inorder, size;
-
- pr_info("0 based eytzinger test:\n");
-
- for (size = 1;
- size < 65536;
- size++) {
- unsigned extra = eytzinger0_extra(size);
-
- if (!(size % 4096))
- pr_info("tree size %u\n", size);
-
- inorder = 0;
- eytzinger0_for_each(eytz, size) {
- BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
- BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
- BUG_ON(eytz != eytzinger0_last(size) &&
- eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
-
- inorder++;
- }
- BUG_ON(inorder != size);
-
- inorder = size - 1;
- eytzinger0_for_each_prev(eytz, size) {
- BUG_ON(eytz != eytzinger0_first(size) &&
- eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz);
-
- inorder--;
- }
- BUG_ON(inorder != -1);
- }
-}
-
-static inline int cmp_u16(const void *_l, const void *_r)
-{
- const u16 *l = _l, *r = _r;
-
- return (*l > *r) - (*r > *l);
-}
-
-static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search)
-{
- int r, s;
- bool bad;
-
- r = eytzinger0_find_le(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- if (r >= 0) {
- if (test_array[r] > search) {
- bad = true;
- } else {
- s = eytzinger0_next(r, nr);
- bad = s >= 0 && test_array[s] <= search;
- }
- } else {
- s = eytzinger0_last(nr);
- bad = s >= 0 && test_array[s] <= search;
- }
-
- if (bad) {
- s = -1;
- eytzinger0_for_each_prev(j, nr) {
- if (test_array[j] <= search) {
- s = j;
- break;
- }
- }
-
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find_le(%12u) = %3i should be %3i\n",
- search, r, s);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search)
-{
- int r, s;
- bool bad;
-
- r = eytzinger0_find_gt(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- if (r >= 0) {
- if (test_array[r] <= search) {
- bad = true;
- } else {
- s = eytzinger0_prev(r, nr);
- bad = s >= 0 && test_array[s] > search;
- }
- } else {
- s = eytzinger0_first(nr);
- bad = s >= 0 && test_array[s] > search;
- }
-
- if (bad) {
- s = -1;
- eytzinger0_for_each(j, nr) {
- if (test_array[j] > search) {
- s = j;
- break;
- }
- }
-
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find_gt(%12u) = %3i should be %3i\n",
- search, r, s);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search)
-{
- int r, s;
- bool bad;
-
- r = eytzinger0_find_ge(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- if (r >= 0) {
- if (test_array[r] < search) {
- bad = true;
- } else {
- s = eytzinger0_prev(r, nr);
- bad = s >= 0 && test_array[s] >= search;
- }
- } else {
- s = eytzinger0_first(nr);
- bad = s >= 0 && test_array[s] >= search;
- }
-
- if (bad) {
- s = -1;
- eytzinger0_for_each(j, nr) {
- if (test_array[j] >= search) {
- s = j;
- break;
- }
- }
-
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find_ge(%12u) = %3i should be %3i\n",
- search, r, s);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search)
-{
- unsigned r;
- int s;
- bool bad;
-
- r = eytzinger0_find(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
-
- if (r < nr) {
- bad = test_array[r] != search;
- } else {
- s = eytzinger0_find_le(test_array, nr,
- sizeof(test_array[0]),
- cmp_u16, &search);
- bad = s >= 0 && test_array[s] == search;
- }
-
- if (bad) {
- eytzinger0_for_each(j, nr)
- pr_info("[%3u] = %12u\n", j, test_array[j]);
- pr_info("find(%12u) = %3i is incorrect\n",
- search, r);
- BUG();
- }
-}
-
-static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
-{
- eytzinger0_find_test_le(test_array, nr, search);
- eytzinger0_find_test_gt(test_array, nr, search);
- eytzinger0_find_test_ge(test_array, nr, search);
- eytzinger0_find_test_eq(test_array, nr, search);
-}
-
-void eytzinger0_find_test(void)
-{
- unsigned i, nr, allocated = 1 << 12;
- u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
-
- for (nr = 1; nr < allocated; nr++) {
- u16 prev = 0;
-
- pr_info("testing %u elems\n", nr);
-
- get_random_bytes(test_array, nr * sizeof(test_array[0]));
- eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
-
- /* verify array is sorted correctly: */
- eytzinger0_for_each(j, nr) {
- BUG_ON(test_array[j] < prev);
- prev = test_array[j];
- }
-
- for (i = 0; i < U16_MAX; i += 1 << 12)
- eytzinger0_find_test_val(test_array, nr, i);
-
- for (i = 0; i < nr; i++) {
- eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
- eytzinger0_find_test_val(test_array, nr, test_array[i]);
- eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
- }
- }
-
- kfree(test_array);
-}
-#endif
-
-/*
- * Accumulate percpu counters onto one cpu's copy - only valid when concurrent
- * access to the percpu counters is otherwise guarded against
- */
-u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
- u64 *ret;
- int cpu;
-
- /* access to pcpu vars has to be blocked by other locking */
- preempt_disable();
- ret = this_cpu_ptr(p);
- preempt_enable();
-
- for_each_possible_cpu(cpu) {
- u64 *i = per_cpu_ptr(p, cpu);
-
- if (i != ret) {
- acc_u64s(ret, i, nr);
- memset(i, 0, nr * sizeof(u64));
- }
- }
-
- return ret;
-}
-
-void bch2_darray_str_exit(darray_const_str *d)
-{
- darray_for_each(*d, i)
- kfree(*i);
- darray_exit(d);
-}
-
-int bch2_split_devs(const char *_dev_name, darray_const_str *ret)
-{
- darray_init(ret);
-
- char *dev_name, *s, *orig;
-
- dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
- if (!dev_name)
- return -ENOMEM;
-
- while ((s = strsep(&dev_name, ":"))) {
- char *p = kstrdup(s, GFP_KERNEL);
- if (!p)
- goto err;
-
- if (darray_push(ret, p)) {
- kfree(p);
- goto err;
- }
- }
-
- kfree(orig);
- return 0;
-err:
- bch2_darray_str_exit(ret);
- kfree(orig);
- return -ENOMEM;
-}
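Example use of the helper above with a made-up two-device mount string; on success the caller owns the strings and releases them with bch2_darray_str_exit():

static void split_devs_example(void)
{
	darray_const_str devs;

	/* "/dev/sda1:/dev/sdb1" is hypothetical; ':' is the separator */
	if (!bch2_split_devs("/dev/sda1:/dev/sdb1", &devs)) {
		darray_for_each(devs, i)
			pr_info("device: %s\n", *i);
		bch2_darray_str_exit(&devs);
	}
}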
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
deleted file mode 100644
index 6488f098d140..000000000000
--- a/fs/bcachefs/util.h
+++ /dev/null
@@ -1,782 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_UTIL_H
-#define _BCACHEFS_UTIL_H
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/closure.h>
-#include <linux/errno.h>
-#include <linux/freezer.h>
-#include <linux/kernel.h>
-#include <linux/min_heap.h>
-#include <linux/sched/clock.h>
-#include <linux/llist.h>
-#include <linux/log2.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/random.h>
-#include <linux/ratelimit.h>
-#include <linux/slab.h>
-#include <linux/sort.h>
-#include <linux/vmalloc.h>
-#include <linux/workqueue.h>
-
-#include "mean_and_variance.h"
-
-#include "darray.h"
-#include "time_stats.h"
-
-struct closure;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define EBUG_ON(cond) BUG_ON(cond)
-#else
-#define EBUG_ON(cond)
-#endif
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define CPU_BIG_ENDIAN 0
-#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define CPU_BIG_ENDIAN 1
-#endif
-
-/* type hackery */
-
-#define type_is_exact(_val, _type) \
- __builtin_types_compatible_p(typeof(_val), _type)
-
-#define type_is(_val, _type) \
- (__builtin_types_compatible_p(typeof(_val), _type) || \
- __builtin_types_compatible_p(typeof(_val), const _type))
-
-/* Userspace doesn't align allocations as nicely as the kernel allocators: */
-static inline size_t buf_pages(void *p, size_t len)
-{
- return DIV_ROUND_UP(len +
- ((unsigned long) p & (PAGE_SIZE - 1)),
- PAGE_SIZE);
-}
-
-static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags)
-{
- void *p = unlikely(n >= INT_MAX)
- ? vmalloc_noprof(n)
- : kvmalloc_noprof(n, flags & ~__GFP_ZERO);
- if (p && (flags & __GFP_ZERO))
- memset(p, 0, n);
- return p;
-}
-#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__))
-
-#define init_heap(heap, _size, gfp) \
-({ \
- (heap)->nr = 0; \
- (heap)->size = (_size); \
- (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
- (gfp)); \
-})
-
-#define free_heap(heap) \
-do { \
- kvfree((heap)->data); \
- (heap)->data = NULL; \
-} while (0)
-
-#define ANYSINT_MAX(t) \
- ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
-
-#include "printbuf.h"
-
-#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__)
-#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__)
-#define printbuf_str(_buf) bch2_printbuf_str(_buf)
-#define printbuf_exit(_buf) bch2_printbuf_exit(_buf)
-
-#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf)
-#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf)
-#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n)
-
-#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n)
-#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n)
-#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n)
-
-#define prt_newline(_out) bch2_prt_newline(_out)
-#define prt_tab(_out) bch2_prt_tab(_out)
-#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out)
-
-#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__)
-#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v))
-#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__)
-#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__)
-#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__)
-#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__)
-#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__)
-#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
-#define prt_bitflags_vector(...) bch2_prt_bitflags_vector(__VA_ARGS__)
-
-void bch2_pr_time_units(struct printbuf *, u64);
-void bch2_prt_datetime(struct printbuf *, time64_t);
-
-#ifdef __KERNEL__
-static inline void uuid_unparse_lower(u8 *uuid, char *out)
-{
- sprintf(out, "%pUb", uuid);
-}
-#else
-#include <uuid/uuid.h>
-#endif
-
-static inline void pr_uuid(struct printbuf *out, u8 *uuid)
-{
- char uuid_str[40];
-
- uuid_unparse_lower(uuid, uuid_str);
- prt_printf(out, "%s", uuid_str);
-}
-
-int bch2_strtoint_h(const char *, int *);
-int bch2_strtouint_h(const char *, unsigned int *);
-int bch2_strtoll_h(const char *, long long *);
-int bch2_strtoull_h(const char *, unsigned long long *);
-int bch2_strtou64_h(const char *, u64 *);
-
-static inline int bch2_strtol_h(const char *cp, long *res)
-{
-#if BITS_PER_LONG == 32
- return bch2_strtoint_h(cp, (int *) res);
-#else
- return bch2_strtoll_h(cp, (long long *) res);
-#endif
-}
-
-static inline int bch2_strtoul_h(const char *cp, long *res)
-{
-#if BITS_PER_LONG == 32
- return bch2_strtouint_h(cp, (unsigned int *) res);
-#else
- return bch2_strtoull_h(cp, (unsigned long long *) res);
-#endif
-}
-
-#define strtoi_h(cp, res) \
- ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\
- : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\
- : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\
- : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\
- : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\
- : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\
- : -EINVAL)
-
-#define strtoul_safe(cp, var) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r) \
- var = _v; \
- _r; \
-})
-
-#define strtoul_safe_clamp(cp, var, min, max) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r) \
- var = clamp_t(typeof(var), _v, min, max); \
- _r; \
-})
-
-#define strtoul_safe_restrict(cp, var, min, max) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r && _v >= min && _v <= max) \
- var = _v; \
- else \
- _r = -EINVAL; \
- _r; \
-})
-
-#define snprint(out, var) \
- prt_printf(out, \
- type_is(var, int) ? "%i\n" \
- : type_is(var, unsigned) ? "%u\n" \
- : type_is(var, long) ? "%li\n" \
- : type_is(var, unsigned long) ? "%lu\n" \
- : type_is(var, s64) ? "%lli\n" \
- : type_is(var, u64) ? "%llu\n" \
- : type_is(var, char *) ? "%s\n" \
- : "%i\n", var)
-
-bool bch2_is_zero(const void *, size_t);
-
-u64 bch2_read_flag_list(const char *, const char * const[]);
-
-void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
-void bch2_prt_u64_base2(struct printbuf *, u64);
-
-void bch2_print_string_as_lines(const char *, const char *);
-
-typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
-void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
-
-static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
-{
-#ifdef __KERNEL__
- prt_printf(out, "%pg", bdev);
-#else
- prt_str(out, bdev->name);
-#endif
-}
-
-void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-
-#define ewma_add(ewma, val, weight) \
-({ \
- typeof(ewma) _ewma = (ewma); \
- typeof(weight) _weight = (weight); \
- \
- (((_ewma << _weight) - _ewma) + (val)) >> _weight; \
-})
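A worked example of the arithmetic (numbers made up): with weight 3 a new sample contributes 1/2^3 of itself to the running average:

static void ewma_example(void)
{
	u64 avg = 100;

	/* ((100 << 3) - 100 + 200) >> 3 == 900 >> 3 == 112 */
	avg = ewma_add(avg, 200, 3);
	pr_info("%llu\n", avg);
}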
-
-struct bch_ratelimit {
- /* Next time we want to do some work, in nanoseconds */
- u64 next;
-
- /*
- * Rate at which we want to do work, in units per nanosecond
- * The units here correspond to the units passed to
- * bch2_ratelimit_increment()
- */
- unsigned rate;
-};
-
-static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
-{
- d->next = local_clock();
-}
-
-u64 bch2_ratelimit_delay(struct bch_ratelimit *);
-void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
-
-struct bch_pd_controller {
- struct bch_ratelimit rate;
- unsigned long last_update;
-
- s64 last_actual;
- s64 smoothed_derivative;
-
- unsigned p_term_inverse;
- unsigned d_smooth;
- unsigned d_term;
-
- /* for exporting to sysfs (no effect on behavior) */
- s64 last_derivative;
- s64 last_proportional;
- s64 last_change;
- s64 last_target;
-
- /*
- * If true, the rate will not increase if bch2_ratelimit_delay()
- * is not being called often enough.
- */
- bool backpressure;
-};
-
-void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
-void bch2_pd_controller_init(struct bch_pd_controller *);
-void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
-
-#define sysfs_pd_controller_attribute(name) \
- rw_attribute(name##_rate); \
- rw_attribute(name##_rate_bytes); \
- rw_attribute(name##_rate_d_term); \
- rw_attribute(name##_rate_p_term_inverse); \
- read_attribute(name##_rate_debug)
-
-#define sysfs_pd_controller_files(name) \
- &sysfs_##name##_rate, \
- &sysfs_##name##_rate_bytes, \
- &sysfs_##name##_rate_d_term, \
- &sysfs_##name##_rate_p_term_inverse, \
- &sysfs_##name##_rate_debug
-
-#define sysfs_pd_controller_show(name, var) \
-do { \
- sysfs_hprint(name##_rate, (var)->rate.rate); \
- sysfs_print(name##_rate_bytes, (var)->rate.rate); \
- sysfs_print(name##_rate_d_term, (var)->d_term); \
- sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
- \
- if (attr == &sysfs_##name##_rate_debug) \
- bch2_pd_controller_debug_to_text(out, var); \
-} while (0)
-
-#define sysfs_pd_controller_store(name, var) \
-do { \
- sysfs_strtoul_clamp(name##_rate, \
- (var)->rate.rate, 1, UINT_MAX); \
- sysfs_strtoul_clamp(name##_rate_bytes, \
- (var)->rate.rate, 1, UINT_MAX); \
- sysfs_strtoul(name##_rate_d_term, (var)->d_term); \
- sysfs_strtoul_clamp(name##_rate_p_term_inverse, \
- (var)->p_term_inverse, 1, INT_MAX); \
-} while (0)
-
-#define container_of_or_null(ptr, type, member) \
-({ \
- typeof(ptr) _ptr = ptr; \
- _ptr ? container_of(_ptr, type, member) : NULL; \
-})
-
-static inline struct list_head *list_pop(struct list_head *head)
-{
- if (list_empty(head))
- return NULL;
-
- struct list_head *ret = head->next;
- list_del_init(ret);
- return ret;
-}
-
-#define list_pop_entry(head, type, member) \
- container_of_or_null(list_pop(head), type, member)
-
-/* Does linear interpolation between powers of two */
-static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
-{
- unsigned fract = x & ~(~0 << fract_bits);
-
- x >>= fract_bits;
- x = 1 << x;
- x += (x * fract) >> fract_bits;
-
- return x;
-}
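A worked example of the interpolation (input chosen arbitrarily): with 4 fractional bits, 52 encodes 3 + 4/16 = 3.25, and the result linearly approximates 2^3.25:

static void fract_exp_two_example(void)
{
	/* integer part 3 -> 2^3 = 8; fractional part adds (8 * 4) >> 4 = 2 */
	pr_info("%u\n", fract_exp_two(52, 4));	/* prints 10, vs. 2^3.25 ~= 9.5 */
}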
-
-void bch2_bio_map(struct bio *bio, void *base, size_t);
-int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
-
-#define closure_bio_submit(bio, cl) \
-do { \
- closure_get(cl); \
- submit_bio(bio); \
-} while (0)
-
-#define kthread_wait(cond) \
-({ \
- int _ret = 0; \
- \
- while (1) { \
- set_current_state(TASK_INTERRUPTIBLE); \
- if (kthread_should_stop()) { \
- _ret = -1; \
- break; \
- } \
- \
- if (cond) \
- break; \
- \
- schedule(); \
- } \
- set_current_state(TASK_RUNNING); \
- _ret; \
-})
-
-#define kthread_wait_freezable(cond) \
-({ \
- int _ret = 0; \
- while (1) { \
- set_current_state(TASK_INTERRUPTIBLE); \
- if (kthread_should_stop()) { \
- _ret = -1; \
- break; \
- } \
- \
- if (cond) \
- break; \
- \
- schedule(); \
- try_to_freeze(); \
- } \
- set_current_state(TASK_RUNNING); \
- _ret; \
-})
-
-u64 bch2_get_random_u64_below(u64);
-
-void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
-void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_corrupt_bio(struct bio *);
-
-static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
-{
- if (ratio && !get_random_u32_below(ratio))
- bch2_corrupt_bio(bio);
-}
-#else
-#define bch2_maybe_corrupt_bio(...) do {} while (0)
-#endif
-
-void bch2_bio_to_text(struct printbuf *, struct bio *);
-
-static inline void memcpy_u64s_small(void *dst, const void *src,
- unsigned u64s)
-{
- u64 *d = dst;
- const u64 *s = src;
-
- while (u64s--)
- *d++ = *s++;
-}
-
-static inline void __memcpy_u64s(void *dst, const void *src,
- unsigned u64s)
-{
-#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
- long d0, d1, d2;
-
- asm volatile("rep ; movsq"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (u64s), "1" (dst), "2" (src)
- : "memory");
-#else
- u64 *d = dst;
- const u64 *s = src;
-
- while (u64s--)
- *d++ = *s++;
-#endif
-}
-
-static inline void memcpy_u64s(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
- dst + u64s * sizeof(u64) <= src));
-
- __memcpy_u64s(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_down(void *dst, const void *src,
- unsigned u64s)
-{
- __memcpy_u64s(dst, src, u64s);
-}
-
-static inline void memmove_u64s_down(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst > src);
-
- __memmove_u64s_down(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_down_small(void *dst, const void *src,
- unsigned u64s)
-{
- memcpy_u64s_small(dst, src, u64s);
-}
-
-static inline void memmove_u64s_down_small(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst > src);
-
- __memmove_u64s_down_small(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
- unsigned u64s)
-{
- u64 *dst = (u64 *) _dst + u64s;
- u64 *src = (u64 *) _src + u64s;
-
- while (u64s--)
- *--dst = *--src;
-}
-
-static inline void memmove_u64s_up_small(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst < src);
-
- __memmove_u64s_up_small(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_up(void *_dst, const void *_src,
- unsigned u64s)
-{
- u64 *dst = (u64 *) _dst + u64s - 1;
- u64 *src = (u64 *) _src + u64s - 1;
-
-#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
- long d0, d1, d2;
-
- asm volatile("std ;\n"
- "rep ; movsq\n"
- "cld ;\n"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (u64s), "1" (dst), "2" (src)
- : "memory");
-#else
- while (u64s--)
- *dst-- = *src--;
-#endif
-}
-
-static inline void memmove_u64s_up(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst < src);
-
- __memmove_u64s_up(dst, src, u64s);
-}
-
-static inline void memmove_u64s(void *dst, const void *src,
- unsigned u64s)
-{
- if (dst < src)
- __memmove_u64s_down(dst, src, u64s);
- else
- __memmove_u64s_up(dst, src, u64s);
-}
-
-/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */
-static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
-{
- unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
-
- memset(s + bytes, c, rem);
-}
-
-/* just the memmove, doesn't update @_nr */
-#define __array_insert_item(_array, _nr, _pos) \
- memmove(&(_array)[(_pos) + 1], \
- &(_array)[(_pos)], \
- sizeof((_array)[0]) * ((_nr) - (_pos)))
-
-#define array_insert_item(_array, _nr, _pos, _new_item) \
-do { \
- __array_insert_item(_array, _nr, _pos); \
- (_nr)++; \
- (_array)[(_pos)] = (_new_item); \
-} while (0)
-
-#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
-do { \
- (_nr) -= (_nr_to_remove); \
- memmove(&(_array)[(_pos)], \
- &(_array)[(_pos) + (_nr_to_remove)], \
- sizeof((_array)[0]) * ((_nr) - (_pos))); \
-} while (0)
-
-#define array_remove_item(_array, _nr, _pos) \
- array_remove_items(_array, _nr, _pos, 1)
-
-static inline void __move_gap(void *array, size_t element_size,
- size_t nr, size_t size,
- size_t old_gap, size_t new_gap)
-{
- size_t gap_end = old_gap + size - nr;
-
- if (new_gap < old_gap) {
- size_t move = old_gap - new_gap;
-
- memmove(array + element_size * (gap_end - move),
- array + element_size * (old_gap - move),
- element_size * move);
- } else if (new_gap > old_gap) {
- size_t move = new_gap - old_gap;
-
- memmove(array + element_size * old_gap,
- array + element_size * gap_end,
- element_size * move);
- }
-}
-
-/* Move the gap in a gap buffer: */
-#define move_gap(_d, _new_gap) \
-do { \
- BUG_ON(_new_gap > (_d)->nr); \
- BUG_ON((_d)->gap > (_d)->nr); \
- \
- __move_gap((_d)->data, sizeof((_d)->data[0]), \
- (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \
- (_d)->gap = _new_gap; \
-} while (0)
-
-#define bubble_sort(_base, _nr, _cmp) \
-do { \
- ssize_t _i, _last; \
- bool _swapped = true; \
- \
- for (_last= (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
- _swapped = false; \
- for (_i = 0; _i < _last; _i++) \
- if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \
- swap((_base)[_i], (_base)[_i + 1]); \
- _swapped = true; \
- } \
- } \
-} while (0)
-
-#define per_cpu_sum(_p) \
-({ \
- TYPEOF_UNQUAL(*_p) _ret = 0; \
- \
- int cpu; \
- for_each_possible_cpu(cpu) \
- _ret += *per_cpu_ptr(_p, cpu); \
- _ret; \
-})
-
-static inline u64 percpu_u64_get(u64 __percpu *src)
-{
- return per_cpu_sum(src);
-}
-
-static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(dst, cpu) = 0;
- this_cpu_write(*dst, src);
-}
-
-static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
-{
- for (unsigned i = 0; i < nr; i++)
- acc[i] += src[i];
-}
-
-static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
- unsigned nr)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
-}
-
-static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- memset(per_cpu_ptr(p, cpu), c, bytes);
-}
-
-u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
-
-static inline int u8_cmp(u8 l, u8 r)
-{
- return cmp_int(l, r);
-}
-
-static inline int cmp_le32(__le32 l, __le32 r)
-{
- return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
-}
-
-#include <linux/uuid.h>
-
-static inline bool qstr_eq(const struct qstr l, const struct qstr r)
-{
- return l.len == r.len && !memcmp(l.name, r.name, l.len);
-}
-
-void bch2_darray_str_exit(darray_const_str *);
-int bch2_split_devs(const char *, darray_const_str *);
-
-#ifdef __KERNEL__
-
-__must_check
-static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
-{
- return copy_to_user(to, from, n) ? -EFAULT : 0;
-}
-
-__must_check
-static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
-{
- return copy_from_user(to, from, n) ? -EFAULT : 0;
-}
-
-#endif
-
-static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
-{
- if (v)
- set_bit(nr, addr);
- else
- clear_bit(nr, addr);
-}
-
-static inline void __set_bit_le64(size_t bit, __le64 *addr)
-{
- addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
-}
-
-static inline void __clear_bit_le64(size_t bit, __le64 *addr)
-{
- addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
-}
-
-static inline bool test_bit_le64(size_t bit, __le64 *addr)
-{
- return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
-}
-
-static inline void memcpy_swab(void *_dst, void *_src, size_t len)
-{
- u8 *dst = _dst + len;
- u8 *src = _src;
-
- while (len--)
- *--dst = *src++;
-}
-
-#define set_flags(_map, _in, _out) \
-do { \
- unsigned _i; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & (1 << _i)) \
- (_out) |= _map[_i]; \
- else \
- (_out) &= ~_map[_i]; \
-} while (0)
-
-#define map_flags(_map, _in) \
-({ \
- unsigned _out = 0; \
- \
- set_flags(_map, _in, _out); \
- _out; \
-})
-
-#define map_flags_rev(_map, _in) \
-({ \
- unsigned _i, _out = 0; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & _map[_i]) { \
- (_out) |= 1 << _i; \
- (_in) &= ~_map[_i]; \
- } \
- (_out); \
-})
-
-#define map_defined(_map) \
-({ \
- unsigned _in = ~0; \
- \
- map_flags_rev(_map, _in); \
-})
-
-#endif /* _BCACHEFS_UTIL_H */
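
The __move_gap()/move_gap() pair above is a textbook gap buffer: the nr live elements sit in [0, gap) and [gap + size - nr, size), and moving the gap shifts only the elements between the old and new gap positions with one memmove(). A minimal userspace sketch of the same logic (the demo function name, the int payload and the concrete numbers are illustrative assumptions, not kernel code):

    #include <stdio.h>
    #include <string.h>
    #include <stddef.h>

    /* Same logic as __move_gap(): elements live in [0, gap) and
     * [gap + size - nr, size); only the run between the old and new
     * gap positions is copied. */
    static void move_gap_demo(void *array, size_t element_size,
                              size_t nr, size_t size,
                              size_t old_gap, size_t new_gap)
    {
            char *a = array;
            size_t gap_end = old_gap + size - nr;

            if (new_gap < old_gap) {
                    size_t move = old_gap - new_gap;

                    memmove(a + element_size * (gap_end - move),
                            a + element_size * (old_gap - move),
                            element_size * move);
            } else if (new_gap > old_gap) {
                    size_t move = new_gap - old_gap;

                    memmove(a + element_size * old_gap,
                            a + element_size * gap_end,
                            element_size * move);
            }
    }

    int main(void)
    {
            /* 5 live elements in an 8-slot buffer, gap at index 2:
             * [10 20 __ __ __ 30 40 50] */
            int buf[8] = { 10, 20, 0, 0, 0, 30, 40, 50 };

            move_gap_demo(buf, sizeof(buf[0]), 5, 8, 2, 4);

            /* Gap is now at index 4: [10 20 30 40 __ __ __ 50] */
            for (int i = 0; i < 8; i++)
                    printf("%d ", buf[i]);
            printf("\n");
            return 0;
    }

Keeping the gap at the insertion point makes clustered insertions cheap, since each move only touches the elements between the old and new gap positions.
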
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
deleted file mode 100644
index 6620ecae26af..000000000000
--- a/fs/bcachefs/varint.c
+++ /dev/null
@@ -1,130 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/bitops.h>
-#include <linux/math.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-#ifdef CONFIG_VALGRIND
-#include <valgrind/memcheck.h>
-#endif
-
-#include "errcode.h"
-#include "varint.h"
-
-/**
- * bch2_varint_encode - encode a variable length integer
- * @out: destination to encode to
- * @v: unsigned integer to encode
- * Returns: size in bytes of the encoded integer - at most 9 bytes
- */
-int bch2_varint_encode(u8 *out, u64 v)
-{
- unsigned bits = fls64(v|1);
- unsigned bytes = DIV_ROUND_UP(bits, 7);
- __le64 v_le;
-
- if (likely(bytes < 9)) {
- v <<= bytes;
- v |= ~(~0 << (bytes - 1));
- v_le = cpu_to_le64(v);
- memcpy(out, &v_le, bytes);
- } else {
- *out++ = 255;
- bytes = 9;
- put_unaligned_le64(v, out);
- }
-
- return bytes;
-}
-
-/**
- * bch2_varint_decode - decode a variable length integer
- * @in: varint to decode
- * @end: end of buffer to decode from
- * @out: on success, decoded integer
- * Returns: size in bytes of the decoded integer - or a negative error code on
- * failure (would have read past the end of the buffer)
- */
-int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
-{
- unsigned bytes = likely(in < end)
- ? ffz(*in & 255) + 1
- : 1;
- u64 v;
-
- if (unlikely(in + bytes > end))
- return -BCH_ERR_varint_decode_error;
-
- if (likely(bytes < 9)) {
- __le64 v_le = 0;
-
- memcpy(&v_le, in, bytes);
- v = le64_to_cpu(v_le);
- v >>= bytes;
- } else {
- v = get_unaligned_le64(++in);
- }
-
- *out = v;
- return bytes;
-}
-
-/**
- * bch2_varint_encode_fast - fast version of bch2_varint_encode
- * @out: destination to encode to
- * @v: unsigned integer to encode
- * Returns: size in bytes of the encoded integer - at most 9 bytes
- *
- * This version assumes it's always safe to write 8 bytes to @out, even if the
- * encoded integer would be smaller.
- */
-int bch2_varint_encode_fast(u8 *out, u64 v)
-{
- unsigned bits = fls64(v|1);
- unsigned bytes = DIV_ROUND_UP(bits, 7);
-
- if (likely(bytes < 9)) {
- v <<= bytes;
- v |= ~(~0U << (bytes - 1));
- } else {
- *out++ = 255;
- bytes = 9;
- }
-
- put_unaligned_le64(v, out);
- return bytes;
-}
-
-/**
- * bch2_varint_decode_fast - fast version of bch2_varint_decode
- * @in: varint to decode
- * @end: end of buffer to decode from
- * @out: on success, decoded integer
- * Returns: size in bytes of the decoded integer - or a negative error code on
- * failure (would have read past the end of the buffer)
- *
- * This version assumes that it is safe to read at most 8 bytes past the end of
- * @end (we still return an error if the varint extends past @end).
- */
-int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
-{
-#ifdef CONFIG_VALGRIND
- VALGRIND_MAKE_MEM_DEFINED(in, 8);
-#endif
- u64 v = get_unaligned_le64(in);
- unsigned bytes = ffz(*in) + 1;
-
- if (unlikely(in + bytes > end))
- return -BCH_ERR_varint_decode_error;
-
- if (likely(bytes < 9)) {
- v >>= bytes;
- v &= ~(~0ULL << (7 * bytes));
- } else {
- v = get_unaligned_le64(++in);
- }
-
- *out = v;
- return bytes;
-}
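
The format implemented above is prefix-free: the first byte carries bytes - 1 low one-bits followed by a zero, and the value is shifted up past those length bits, so ffz() on the first byte alone recovers the encoded length. A minimal userspace round-trip sketch of the same scheme (the function names, the gcc builtins and the little-endian memcpy are assumptions of this sketch, not the kernel helpers):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* The count of low one-bits before the first zero in the first byte
     * selects the length, as in bch2_varint_encode()/bch2_varint_decode(). */
    static int varint_encode(uint8_t *out, uint64_t v)
    {
            unsigned bits = 64 - __builtin_clzll(v | 1);
            unsigned bytes = (bits + 6) / 7;

            if (bytes < 9) {
                    v <<= bytes;
                    v |= ~(~0ULL << (bytes - 1));
                    memcpy(out, &v, bytes);         /* assumes little endian */
            } else {
                    *out++ = 255;
                    bytes = 9;
                    memcpy(out, &v, 8);
            }
            return bytes;
    }

    static int varint_decode(const uint8_t *in, uint64_t *out)
    {
            unsigned bytes = __builtin_ctz(~in[0] & 0x1ff) + 1;
            uint64_t v = 0;

            if (bytes < 9) {
                    memcpy(&v, in, bytes);          /* assumes little endian */
                    v >>= bytes;
            } else {
                    memcpy(&v, in + 1, 8);
            }
            *out = v;
            return bytes;
    }

    int main(void)
    {
            uint8_t buf[16];
            uint64_t v;

            /* 300 needs 9 significant bits -> 2 encoded bytes */
            int n = varint_encode(buf, 300);
            varint_decode(buf, &v);
            printf("encoded in %d bytes, decoded back to %llu\n",
                   n, (unsigned long long)v);
            return 0;
    }

Compared to LEB128-style continuation bits, the decoder learns the length from the first byte and can then do a single unaligned load, which is what the _fast variants above exploit.
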
diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h
deleted file mode 100644
index 92a182fb3d7a..000000000000
--- a/fs/bcachefs/varint.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_VARINT_H
-#define _BCACHEFS_VARINT_H
-
-int bch2_varint_encode(u8 *, u64);
-int bch2_varint_decode(const u8 *, const u8 *, u64 *);
-
-int bch2_varint_encode_fast(u8 *, u64);
-int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *);
-
-#endif /* _BCACHEFS_VARINT_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
deleted file mode 100644
index 2ad338e282da..000000000000
--- a/fs/bcachefs/vstructs.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _VSTRUCTS_H
-#define _VSTRUCTS_H
-
-#include "util.h"
-
-/*
- * NOTE: we can't differentiate between __le64 and u64 with type_is - this
- * assumes u64 is little endian:
- */
-#define __vstruct_u64s(_s) \
-({ \
- ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
- : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
- : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
- : ((__force u8) ((_s)->u64s))); \
-})
-
-#define __vstruct_bytes(_type, _u64s) \
-({ \
- BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
- \
- (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
-})
-
-#define vstruct_bytes(_s) \
- __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
-
-#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \
- (round_up(__vstruct_bytes(_type, _u64s), \
- 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
-
-#define vstruct_blocks(_s, _sector_block_bits) \
- __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
-
-#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \
- __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \
- __vstruct_u64s(_s) + (_u64s))
-
-#define vstruct_sectors(_s, _sector_block_bits) \
- (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
-
-#define vstruct_next(_s) \
- ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-#define vstruct_last(_s) \
- ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-#define vstruct_end(_s) \
- ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
-
-#define vstruct_for_each(_s, _i) \
- for (typeof(&(_s)->start[0]) _i = (_s)->start; \
- _i < vstruct_last(_s); \
- _i = vstruct_next(_i))
-
-#define vstruct_for_each_safe(_s, _i) \
- for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \
- _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \
- _i = _next)
-
-#define vstruct_idx(_s, _idx) \
- ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
-
-#endif /* _VSTRUCTS_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
deleted file mode 100644
index 627f153798c6..000000000000
--- a/fs/bcachefs/xattr.c
+++ /dev/null
@@ -1,642 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "acl.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "fs.h"
-#include "rebalance.h"
-#include "str_hash.h"
-#include "xattr.h"
-
-#include <linux/dcache.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
-
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
-
-static u64 bch2_xattr_hash(const struct bch_hash_info *info,
- const struct xattr_search_key *key)
-{
- struct bch_str_hash_ctx ctx;
-
- bch2_str_hash_init(&ctx, info);
- bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
- bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
-
- return bch2_str_hash_end(&ctx, info);
-}
-
-static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
-{
- return bch2_xattr_hash(info, key);
-}
-
-static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
- struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
-
- return bch2_xattr_hash(info,
- &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len));
-}
-
-static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
-{
- struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
- const struct xattr_search_key *r = _r;
-
- return l.v->x_type != r->type ||
- l.v->x_name_len != r->name.len ||
- memcmp(l.v->x_name_and_value, r->name.name, r->name.len);
-}
-
-static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
- struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
- struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
-
- return l.v->x_type != r.v->x_type ||
- l.v->x_name_len != r.v->x_name_len ||
- memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len);
-}
-
-const struct bch_hash_desc bch2_xattr_hash_desc = {
- .btree_id = BTREE_ID_xattrs,
- .key_type = KEY_TYPE_xattr,
- .hash_key = xattr_hash_key,
- .hash_bkey = xattr_hash_bkey,
- .cmp_key = xattr_cmp_key,
- .cmp_bkey = xattr_cmp_bkey,
-};
-
-int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
- struct bkey_validate_context from)
-{
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len));
- int ret = 0;
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s,
- c, xattr_val_size_too_small,
- "value too small (%zu < %u)",
- bkey_val_u64s(k.k), val_u64s);
-
- /* XXX why +4 ? */
- val_u64s = xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len) + 4);
-
- bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s,
- c, xattr_val_size_too_big,
- "value too big (%zu > %u)",
- bkey_val_u64s(k.k), val_u64s);
-
- bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type),
- c, xattr_invalid_type,
- "invalid type (%u)", xattr.v->x_type);
-
- bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len),
- c, xattr_name_invalid_chars,
- "xattr name has invalid characters");
-fsck_err:
- return ret;
-}
-
-void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- const struct xattr_handler *handler;
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
-
- handler = bch2_xattr_type_to_handler(xattr.v->x_type);
- if (handler && handler->prefix)
- prt_printf(out, "%s", handler->prefix);
- else if (handler)
- prt_printf(out, "(type %u)", xattr.v->x_type);
- else
- prt_printf(out, "(unknown type %u)", xattr.v->x_type);
-
- unsigned name_len = xattr.v->x_name_len;
- unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
- unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
- offsetof(struct bch_xattr, x_name_and_value);
-
- val_len = min_t(int, val_len, max_name_val_bytes - name_len);
- name_len = min(name_len, max_name_val_bytes);
-
- prt_printf(out, "%.*s:%.*s",
- name_len, xattr.v->x_name_and_value,
- val_len, (char *) xattr_val(xattr.v));
-
- if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
- xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
- prt_char(out, ' ');
- bch2_acl_to_text(out, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
- }
-}
-
-static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
- const char *name, void *buffer, size_t size, int type)
-{
- struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
- struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
- struct btree_iter iter;
- struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
- inode_inum(inode), &search, 0);
- int ret = bkey_err(k);
- if (ret)
- return ret;
-
- struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- ret = le16_to_cpu(xattr.v->x_val_len);
- if (buffer) {
- if (ret > size)
- ret = -ERANGE;
- else
- memcpy(buffer, xattr_val(xattr.v), ret);
- }
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
- struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
- const char *name, const void *value, size_t size,
- int type, int flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter inode_iter = {};
- int ret;
-
- ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
- bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_intent);
- if (ret)
- return ret;
-
-	/*
-	 * Besides the ctime update, extent, dirent and xattr updates require
-	 * that an inode update also happens - to ensure that if a key exists in
-	 * one of those btrees with a given snapshot ID, an inode is also present.
-	 */
- inode_u->bi_ctime = bch2_current_time(c);
-
- ret = bch2_inode_write(trans, &inode_iter, inode_u);
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (ret)
- return ret;
-
- if (value) {
- struct bkey_i_xattr *xattr;
- unsigned namelen = strlen(name);
- unsigned u64s = BKEY_U64s +
- xattr_val_u64s(namelen, size);
-
- if (u64s > U8_MAX)
- return -ERANGE;
-
- xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
- if (IS_ERR(xattr))
- return PTR_ERR(xattr);
-
- bkey_xattr_init(&xattr->k_i);
- xattr->k.u64s = u64s;
- xattr->v.x_type = type;
- xattr->v.x_name_len = namelen;
- xattr->v.x_val_len = cpu_to_le16(size);
- memcpy(xattr->v.x_name_and_value, name, namelen);
- memcpy(xattr_val(&xattr->v), value, size);
-
- ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
- inum, &xattr->k_i,
- (flags & XATTR_CREATE ? STR_HASH_must_create : 0)|
- (flags & XATTR_REPLACE ? STR_HASH_must_replace : 0));
- } else {
- struct xattr_search_key search =
- X_SEARCH(type, name, strlen(name));
-
- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
- hash_info, inum, &search);
- }
-
- if (bch2_err_matches(ret, ENOENT))
- ret = flags & XATTR_REPLACE ? -ENODATA : 0;
-
- return ret;
-}
-
-struct xattr_buf {
- char *buf;
- size_t len;
- size_t used;
-};
-
-static int __bch2_xattr_emit(const char *prefix,
- const char *name, size_t name_len,
- struct xattr_buf *buf)
-{
- const size_t prefix_len = strlen(prefix);
- const size_t total_len = prefix_len + name_len + 1;
-
- if (buf->buf) {
- if (buf->used + total_len > buf->len)
- return -ERANGE;
-
- memcpy(buf->buf + buf->used, prefix, prefix_len);
- memcpy(buf->buf + buf->used + prefix_len,
- name, name_len);
- buf->buf[buf->used + prefix_len + name_len] = '\0';
- }
-
- buf->used += total_len;
- return 0;
-}
-
-static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry)
-{
- const struct xattr_handler *handler = bch2_xattr_type_to_handler(type);
-
- if (!xattr_handler_can_list(handler, dentry))
- return NULL;
-
- return xattr_prefix(handler);
-}
-
-static int bch2_xattr_emit(struct dentry *dentry,
- const struct bch_xattr *xattr,
- struct xattr_buf *buf)
-{
- const char *prefix;
-
- prefix = bch2_xattr_prefix(xattr->x_type, dentry);
- if (!prefix)
- return 0;
-
- return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf);
-}
-
-static int bch2_xattr_list_bcachefs(struct bch_fs *c,
- struct bch_inode_unpacked *inode,
- struct xattr_buf *buf,
- bool all)
-{
- const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
- unsigned id;
- int ret = 0;
- u64 v;
-
- for (id = 0; id < Inode_opt_nr; id++) {
- v = bch2_inode_opt_get(inode, id);
- if (!v)
- continue;
-
- if (!all &&
- !(inode->bi_fields_set & (1 << id)))
- continue;
-
- ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
- strlen(bch2_inode_opts[id]), buf);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
- struct bch_fs *c = dentry->d_sb->s_fs_info;
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
- u64 offset = 0, inum = inode->ei_inode.bi_inum;
-
- int ret = bch2_trans_run(c,
- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs,
- POS(inum, offset),
- POS(inum, U64_MAX),
- inode->ei_inum.subvol, 0, k, ({
- if (k.k->type != KEY_TYPE_xattr)
- continue;
-
- bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
- }))) ?:
- bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
- bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
-
- return ret ? bch2_err_class(ret) : buf.used;
-}
-
-static int bch2_xattr_get_handler(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_trans_do(c,
- bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
-
- if (ret < 0 && bch2_err_matches(ret, ENOENT))
- ret = -ENODATA;
-
- return bch2_err_class(ret);
-}
-
-static int bch2_xattr_set_handler(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *vinode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
- struct bch_inode_unpacked inode_u;
- int ret;
-
- ret = bch2_trans_run(c,
- commit_do(trans, NULL, NULL, 0,
- bch2_xattr_set(trans, inode_inum(inode), &inode_u,
- &hash, name, value, size,
- handler->flags, flags)) ?:
- (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
-
- return bch2_err_class(ret);
-}
-
-static const struct xattr_handler bch_xattr_user_handler = {
- .prefix = XATTR_USER_PREFIX,
- .get = bch2_xattr_get_handler,
- .set = bch2_xattr_set_handler,
- .flags = KEY_TYPE_XATTR_INDEX_USER,
-};
-
-static bool bch2_xattr_trusted_list(struct dentry *dentry)
-{
- return capable(CAP_SYS_ADMIN);
-}
-
-static const struct xattr_handler bch_xattr_trusted_handler = {
- .prefix = XATTR_TRUSTED_PREFIX,
- .list = bch2_xattr_trusted_list,
- .get = bch2_xattr_get_handler,
- .set = bch2_xattr_set_handler,
- .flags = KEY_TYPE_XATTR_INDEX_TRUSTED,
-};
-
-static const struct xattr_handler bch_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .get = bch2_xattr_get_handler,
- .set = bch2_xattr_set_handler,
- .flags = KEY_TYPE_XATTR_INDEX_SECURITY,
-};
-
-#ifndef NO_BCACHEFS_FS
-
-static int opt_to_inode_opt(int id)
-{
- switch (id) {
-#define x(name, ...) \
- case Opt_##name: return Inode_opt_##name;
- BCH_INODE_OPTS()
-#undef x
- default:
- return -1;
- }
-}
-
-static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size,
- bool all)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_opts opts =
- bch2_inode_opts_to_opts(&inode->ei_inode);
- const struct bch_option *opt;
- int id, inode_opt_id;
- struct printbuf out = PRINTBUF;
- int ret;
- u64 v;
-
- id = bch2_opt_lookup(name);
- if (id < 0 || !bch2_opt_is_inode_opt(id))
- return -EINVAL;
-
- inode_opt_id = opt_to_inode_opt(id);
- if (inode_opt_id < 0)
- return -EINVAL;
-
- opt = bch2_opt_table + id;
-
- if (!bch2_opt_defined_by_id(&opts, id))
- return -ENODATA;
-
- if (!all &&
- !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id)))
- return -ENODATA;
-
- v = bch2_opt_get_by_id(&opts, id);
- bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
-
- ret = out.pos;
-
- if (out.allocation_failure) {
- ret = -ENOMEM;
- } else if (buffer) {
- if (out.pos > size)
- ret = -ERANGE;
- else
- memcpy(buffer, out.buf, out.pos);
- }
-
- printbuf_exit(&out);
- return ret;
-}
-
-static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size)
-{
- return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
- name, buffer, size, false);
-}
-
-struct inode_opt_set {
- int id;
- u64 v;
- bool defined;
-};
-
-static int inode_opt_set_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct inode_opt_set *s = p;
-
- if (s->id == Inode_opt_casefold) {
- int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v);
- if (ret)
- return ret;
- }
-
- if (s->defined)
- bi->bi_fields_set |= 1U << s->id;
- else
- bi->bi_fields_set &= ~(1U << s->id);
-
- bch2_inode_opt_set(bi, s->id, s->v);
-
- return 0;
-}
-
-static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *vinode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- const struct bch_option *opt;
- char *buf;
- struct inode_opt_set s;
- int opt_id, inode_opt_id, ret;
-
- opt_id = bch2_opt_lookup(name);
- if (opt_id < 0)
- return -EINVAL;
-
- opt = bch2_opt_table + opt_id;
-
- inode_opt_id = opt_to_inode_opt(opt_id);
- if (inode_opt_id < 0)
- return -EINVAL;
-
- s.id = inode_opt_id;
-
- if (value) {
- u64 v = 0;
-
- buf = kmalloc(size + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- memcpy(buf, value, size);
- buf[size] = '\0';
-
- ret = bch2_opt_parse(c, opt, buf, &v, NULL);
- kfree(buf);
-
- if (ret < 0)
- goto err_class_exit;
-
- ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v);
- if (ret < 0)
- goto err_class_exit;
-
- s.v = v + 1;
- s.defined = true;
- } else {
- /*
-		 * Check if this option was set on the parent - if so, switch
-		 * back to inheriting from the parent:
- *
- * rename() also has to deal with keeping inherited options up
- * to date - see bch2_reinherit_attrs()
- */
- spin_lock(&dentry->d_lock);
- if (!IS_ROOT(dentry)) {
- struct bch_inode_info *dir =
- to_bch_ei(d_inode(dentry->d_parent));
-
- s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id);
- } else {
- s.v = 0;
- }
- spin_unlock(&dentry->d_lock);
-
- s.defined = false;
- }
-
- mutex_lock(&inode->ei_update_lock);
- if (inode_opt_id == Inode_opt_project) {
- /*
- * inode fields accessible via the xattr interface are stored
- * with a +1 bias, so that 0 means unset:
- */
- ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
- if (ret)
- goto err;
- }
-
- ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
-err:
- mutex_unlock(&inode->ei_update_lock);
-err_class_exit:
- return bch2_err_class(ret);
-}
-
-static const struct xattr_handler bch_xattr_bcachefs_handler = {
- .prefix = "bcachefs.",
- .get = bch2_xattr_bcachefs_get,
- .set = bch2_xattr_bcachefs_set,
-};
-
-static int bch2_xattr_bcachefs_get_effective(
- const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *vinode,
- const char *name, void *buffer, size_t size)
-{
- return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
- name, buffer, size, true);
-}
-
-/* Noop - xattrs in the bcachefs_effective namespace are inherited */
-static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *vinode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return 0;
-}
-
-static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
- .prefix = "bcachefs_effective.",
- .get = bch2_xattr_bcachefs_get_effective,
- .set = bch2_xattr_bcachefs_set_effective,
-};
-
-#endif /* NO_BCACHEFS_FS */
-
-const struct xattr_handler * const bch2_xattr_handlers[] = {
- &bch_xattr_user_handler,
- &bch_xattr_trusted_handler,
- &bch_xattr_security_handler,
-#ifndef NO_BCACHEFS_FS
- &bch_xattr_bcachefs_handler,
- &bch_xattr_bcachefs_effective_handler,
-#endif
- NULL
-};
-
-static const struct xattr_handler *bch_xattr_handler_map[] = {
- [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler,
- [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] =
- &nop_posix_acl_access,
- [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] =
- &nop_posix_acl_default,
- [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
- [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
-};
-
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
-{
- return type < ARRAY_SIZE(bch_xattr_handler_map)
- ? bch_xattr_handler_map[type]
- : NULL;
-}
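
__bch2_xattr_emit() above follows the usual listxattr(2) convention: each visible attribute is appended to the output as prefix + name + '\0', and when the caller passes a NULL buffer only the total length is accumulated so userspace can size its buffer first. A small userspace sketch of that packing (the names here are illustrative, not the kernel structures):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    struct xattr_buf {
            char *buf;
            size_t len;
            size_t used;
    };

    /* Append "prefix" + "name" + '\0'; with a NULL buffer just count. */
    static int emit_name(struct xattr_buf *buf, const char *prefix, const char *name)
    {
            size_t prefix_len = strlen(prefix);
            size_t name_len = strlen(name);
            size_t total = prefix_len + name_len + 1;

            if (buf->buf) {
                    if (buf->used + total > buf->len)
                            return -ERANGE;
                    memcpy(buf->buf + buf->used, prefix, prefix_len);
                    memcpy(buf->buf + buf->used + prefix_len, name, name_len);
                    buf->buf[buf->used + prefix_len + name_len] = '\0';
            }
            buf->used += total;
            return 0;
    }

    int main(void)
    {
            char out[64];
            struct xattr_buf buf = { .buf = out, .len = sizeof(out) };

            emit_name(&buf, "user.", "comment");
            emit_name(&buf, "bcachefs.", "compression");
            printf("%zu bytes used\n", buf.used);   /* 13 + 21 = 34 */
            return 0;
    }
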
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
deleted file mode 100644
index 1139bf345f70..000000000000
--- a/fs/bcachefs/xattr.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_XATTR_H
-#define _BCACHEFS_XATTR_H
-
-#include "str_hash.h"
-
-extern const struct bch_hash_desc bch2_xattr_hash_desc;
-
-int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c,
- struct bkey_validate_context);
-void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-
-#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
- .key_validate = bch2_xattr_validate, \
- .val_to_text = bch2_xattr_to_text, \
- .min_val_size = 8, \
-})
-
-static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
-{
- return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) +
- name_len + val_len, sizeof(u64));
-}
-
-#define xattr_val(_xattr) \
- ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len)
-
-struct xattr_search_key {
- u8 type;
- struct qstr name;
-};
-
-#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \
- { .type = _type, .name = QSTR_INIT(_name, _len) })
-
-struct dentry;
-struct xattr_handler;
-struct bch_hash_info;
-struct bch_inode_info;
-
-/* Exported for cmd_migrate.c in tools: */
-int bch2_xattr_set(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *, const struct bch_hash_info *,
- const char *, const void *, size_t, int, int);
-
-ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
-
-extern const struct xattr_handler * const bch2_xattr_handlers[];
-
-#endif /* _BCACHEFS_XATTR_H */
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
deleted file mode 100644
index 4121b78d9a92..000000000000
--- a/fs/bcachefs/xattr_format.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_XATTR_FORMAT_H
-#define _BCACHEFS_XATTR_FORMAT_H
-
-#define KEY_TYPE_XATTR_INDEX_USER 0
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
-#define KEY_TYPE_XATTR_INDEX_SECURITY 4
-
-struct bch_xattr {
- struct bch_val v;
- __u8 x_type;
- __u8 x_name_len;
- __le16 x_val_len;
- /*
- * x_name contains the name and value counted by
- * x_name_len + x_val_len. The introduction of
- * __counted_by(x_name_len) previously caused a false positive
- * detection of an out of bounds write.
- */
- __u8 x_name_and_value[];
-} __packed __aligned(8);
-
-#endif /* _BCACHEFS_XATTR_FORMAT_H */
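
As the layout comment notes, a bch_xattr packs the name and the value back to back in x_name_and_value: the value starts at x_name_and_value + x_name_len (the xattr_val() macro in xattr.h), and xattr_val_u64s() rounds the whole thing up to u64 words. A rough userspace sketch of the size calculation (the mock struct omits the empty bch_val header and is only for illustration):

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* 4-byte header (type, name_len, val_len) followed by name bytes,
     * then value bytes, padded out to a multiple of 8. */
    struct mock_xattr {
            uint8_t  x_type;
            uint8_t  x_name_len;
            uint16_t x_val_len;
            uint8_t  x_name_and_value[];
    };

    static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
    {
            return (offsetof(struct mock_xattr, x_name_and_value) +
                    name_len + val_len + 7) / 8;
    }

    int main(void)
    {
            /* The xattr core strips the "user." prefix, so "user.comment" is
             * stored as name "comment" (7 bytes); with a 5-byte value:
             * 4 + 7 + 5 = 16 bytes -> 2 u64s. */
            printf("%u u64s\n", xattr_val_u64s(7, 5));
            return 0;
    }
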
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 264fba0d44bd..e4653bb99946 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -103,6 +103,21 @@ static struct linux_binfmt elf_format = {
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
+static inline void elf_coredump_set_mm_eflags(struct mm_struct *mm, u32 flags)
+{
+#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS
+ mm->saved_e_flags = flags;
+#endif
+}
+
+static inline u32 elf_coredump_get_mm_eflags(struct mm_struct *mm, u32 flags)
+{
+#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS
+ flags = mm->saved_e_flags;
+#endif
+ return flags;
+}
+
/*
* We need to explicitly zero any trailing portion of the page that follows
* p_filesz when it ends before the page ends (e.g. bss), otherwise this
@@ -1290,6 +1305,8 @@ out_free_interp:
mm->end_data = end_data;
mm->start_stack = bprm->p;
+ elf_coredump_set_mm_eflags(mm, elf_ex->e_flags);
+
/**
* DOC: "brk" handling
*
@@ -1804,6 +1821,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_thread_core_info *t;
struct elf_prpsinfo *psinfo;
struct core_thread *ct;
+ u16 machine;
+ u32 flags;
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
if (!psinfo)
@@ -1831,30 +1850,37 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
return 0;
}
- /*
- * Initialize the ELF file header.
- */
- fill_elf_header(elf, phdrs,
- view->e_machine, view->e_flags);
+ machine = view->e_machine;
+ flags = view->e_flags;
#else
view = NULL;
info->thread_notes = 2;
- fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
+ machine = ELF_ARCH;
+ flags = ELF_CORE_EFLAGS;
#endif
/*
+	 * Override the ELF e_flags with the value taken from the process,
+	 * if the arch needs that.
+ */
+ flags = elf_coredump_get_mm_eflags(dump_task->mm, flags);
+
+ /*
+ * Initialize the ELF file header.
+ */
+ fill_elf_header(elf, phdrs, machine, flags);
+
+ /*
* Allocate a structure for each thread.
*/
- info->thread = kzalloc(offsetof(struct elf_thread_core_info,
- notes[info->thread_notes]),
- GFP_KERNEL);
+ info->thread = kzalloc(struct_size(info->thread, notes, info->thread_notes),
+ GFP_KERNEL);
if (unlikely(!info->thread))
return 0;
info->thread->task = dump_task;
for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) {
- t = kzalloc(offsetof(struct elf_thread_core_info,
- notes[info->thread_notes]),
+ t = kzalloc(struct_size(t, notes, info->thread_notes),
GFP_KERNEL);
if (unlikely(!t))
return 0;
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ea95c90c8474..4438637c8900 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -62,6 +62,7 @@ config BTRFS_FS_RUN_SANITY_TESTS
config BTRFS_DEBUG
bool "Btrfs debugging support"
depends on BTRFS_FS
+ select REF_TRACKER if STACKTRACE_SUPPORT
help
Enable run-time debugging support for the btrfs filesystem.
@@ -117,14 +118,3 @@ config BTRFS_EXPERIMENTAL
- large folio support
If unsure, say N.
-
-config BTRFS_FS_REF_VERIFY
- bool "Btrfs with the ref verify tool compiled in"
- depends on BTRFS_FS
- default n
- help
- Enable run-time extent reference verification instrumentation. This
- is meant to be used by btrfs developers for tracking down extent
- reference problems or verifying they didn't break something.
-
- If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 2d5f0482678b..743d7677b175 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -36,7 +36,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+btrfs-$(CONFIG_BTRFS_DEBUG) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 861c7d92c437..1248aa2535d3 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -44,7 +44,7 @@ static __always_inline void memcpy_split_src(char *dest, const char *src1,
* gives us all the type checking.
*
* The extent buffer pages stored in the array folios may not form a contiguous
- * phyusical range, but the API functions assume the linear offset to the range
+ * physical range, but the API functions assume the linear offset to the range
* from 0 to metadata node size.
*/
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6a450be293b1..2ab550a1e715 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -859,7 +859,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
free_pref(ref);
return PTR_ERR(eb);
}
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_pref(ref);
free_extent_buffer(eb);
return -EIO;
@@ -1062,7 +1062,7 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_get_extent_inline_ref_type(leaf, iref,
BTRFS_REF_TYPE_ANY);
- if (type == BTRFS_REF_TYPE_INVALID)
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID))
return -EUCLEAN;
offset = btrfs_extent_inline_ref_offset(leaf, iref);
@@ -1422,7 +1422,7 @@ again:
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -1614,7 +1614,7 @@ again:
ret = PTR_ERR(eb);
goto out;
}
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
@@ -1652,7 +1652,7 @@ again:
* case.
*/
ASSERT(eie);
- if (!eie) {
+ if (unlikely(!eie)) {
ret = -EUCLEAN;
goto out;
}
@@ -1690,7 +1690,7 @@ out:
* @ctx->bytenr and @ctx->extent_item_pos. The bytenr of the found leaves are
* added to the ulist at @ctx->refs, and that ulist is allocated by this
* function. The caller should free the ulist with free_leaf_list() if
- * @ctx->ignore_extent_item_pos is false, otherwise a fimple ulist_free() is
+ * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is
* enough.
*
* Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated.
@@ -2215,7 +2215,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2312,7 +2312,7 @@ static int get_extent_inline_ref(unsigned long *ptr,
*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
*out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref,
BTRFS_REF_TYPE_ANY);
- if (*out_type == BTRFS_REF_TYPE_INVALID)
+ if (unlikely(*out_type == BTRFS_REF_TYPE_INVALID))
return -EUCLEAN;
*ptr += btrfs_extent_inline_ref_size(*out_type);
@@ -2868,7 +2868,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2876,7 +2876,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
ret = -EUCLEAN;
goto release;
}
- if (path->slots[0] == 0) {
+ if (unlikely(path->slots[0] == 0)) {
DEBUG_WARN();
ret = -EUCLEAN;
goto release;
@@ -3457,7 +3457,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
/* No extra backref? This means the tree block is corrupted */
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -3500,7 +3500,7 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
((unsigned long)iter->cur_ptr);
type = btrfs_get_extent_inline_ref_type(eb, iref,
BTRFS_REF_TYPE_BLOCK);
- if (type == BTRFS_REF_TYPE_INVALID) {
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID)) {
ret = -EUCLEAN;
goto out;
}
@@ -3612,7 +3612,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
}
/* Sanity check, we shouldn't have any unchecked nodes */
- if (!upper->checked) {
+ if (unlikely(!upper->checked)) {
DEBUG_WARN("we should not have any unchecked nodes");
return -EUCLEAN;
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 34b0193a181c..25d51c246070 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -190,7 +190,7 @@ struct btrfs_backref_share_check_ctx {
* It's very common to have several file extent items that point to the
* same extent (bytenr) but with different offsets and lengths. This
* typically happens for COW writes, partial writes into prealloc
- * extents, NOCOW writes after snapshoting a root, hole punching or
+ * extents, NOCOW writes after snapshotting a root, hole punching or
* reflinking within the same file (less common perhaps).
* So keep a small cache with the lookup results for the extent pointed
* by the last few file extent items. This cache is checked, with a
@@ -414,7 +414,7 @@ struct btrfs_backref_cache {
/*
* Whether this cache is for relocation
*
- * Reloction backref cache require more info for reloc root compared
+	 * Relocation backref cache requires more info for reloc root compared
* to generic backref cache.
*/
bool is_reloc;
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 50b5fc1c06d7..21df48e6c4fa 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -93,6 +93,7 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
refcount_inc(&orig_bbio->ordered->refs);
bbio->ordered = orig_bbio->ordered;
}
+ bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
atomic_inc(&orig_bbio->pending_ios);
return bbio;
}
@@ -166,7 +167,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
int mirror = repair_bbio->mirror_num;
if (repair_bbio->bio.bi_status ||
- !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
+ !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
repair_bbio->bio.bi_iter = repair_bbio->saved_iter;
@@ -203,18 +204,21 @@ done:
*/
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
u32 bio_offset,
- struct bio_vec *bv,
+ phys_addr_t paddr,
struct btrfs_failed_bio *fbio)
{
struct btrfs_inode *inode = failed_bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct folio *folio = page_folio(phys_to_page(paddr));
const u32 sectorsize = fs_info->sectorsize;
+ const u32 foff = offset_in_folio(folio, paddr);
const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
struct btrfs_bio *repair_bbio;
struct bio *repair_bio;
int num_copies;
int mirror;
+ ASSERT(foff + sectorsize <= folio_size(folio));
btrfs_debug(fs_info, "repair read error: read error at %llu",
failed_bbio->file_offset + bio_offset);
@@ -237,7 +241,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
&btrfs_repair_bioset);
repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
- __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+ bio_add_folio_nofail(repair_bio, folio, sectorsize, foff);
repair_bbio = btrfs_bio(repair_bio);
btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
@@ -258,6 +262,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
struct bvec_iter *iter = &bbio->saved_iter;
blk_status_t status = bbio->bio.bi_status;
struct btrfs_failed_bio *fbio = NULL;
+ phys_addr_t paddr;
u32 offset = 0;
/* Read-repair requires the inode field to be set by the submitter. */
@@ -275,17 +280,11 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
/* Clear the I/O error. A failed repair will reset it. */
bbio->bio.bi_status = BLK_STS_OK;
- while (iter->bi_size) {
- struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);
-
- bv.bv_len = min(bv.bv_len, sectorsize);
- if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
- fbio = repair_one_sector(bbio, offset, &bv, fbio);
-
- bio_advance_iter_single(&bbio->bio, iter, sectorsize);
+ btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) {
+ if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr))
+ fbio = repair_one_sector(bbio, offset, paddr, fbio);
offset += sectorsize;
}
-
if (bbio->csum != bbio->csum_inline)
kfree(bbio->csum);
@@ -780,11 +779,38 @@ end_bbio:
return true;
}
+static void assert_bbio_alignment(struct btrfs_bio *bbio)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ const u32 blocksize = fs_info->sectorsize;
+
+ /* Metadata has no extra bs > ps alignment requirement. */
+ if (!is_data_bbio(bbio))
+ return;
+
+ bio_for_each_bvec(bvec, &bbio->bio, iter)
+ ASSERT(IS_ALIGNED(bvec.bv_offset, blocksize) &&
+ IS_ALIGNED(bvec.bv_len, blocksize),
+ "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
+ btrfs_root_id(bbio->inode->root),
+ btrfs_ino(bbio->inode),
+ bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT,
+ bbio->bio.bi_iter.bi_size, iter.bi_idx,
+ bvec.bv_offset,
+ bvec.bv_len);
+#endif
+}
+
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
/* If bbio->inode is not populated, its file_offset must be 0. */
ASSERT(bbio->inode || bbio->file_offset == 0);
+ assert_bbio_alignment(bbio);
+
while (!btrfs_submit_chunk(bbio, mirror_num))
;
}
@@ -823,8 +849,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (ret < 0)
goto out_counter_dec;
- if (!smap.dev->bdev ||
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
+ if (unlikely(!smap.dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
ret = -EIO;
goto out_counter_dec;
}
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index dc2eb43b7097..00883aea55d7 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -82,6 +82,8 @@ struct btrfs_bio {
/* Save the first error status of split bio. */
blk_status_t status;
+ /* Use the commit root to look up csums (data read bio only). */
+ bool csum_search_commit_root;
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9bf282d2453c..5322ef2ae015 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1358,7 +1358,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* data in this block group. That check should be done by relocation routine,
* not this function.
*/
-static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
@@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
bg1 = list_entry(a, struct btrfs_block_group, bg_list);
bg2 = list_entry(b, struct btrfs_block_group, bg_list);
- return bg1->used > bg2->used;
+ /*
+ * Some other task may be updating the ->used field concurrently, but it
+ * is not serious if we get a stale value or load/store tearing issues,
+ * as sorting the list of block groups to reclaim is not critical and an
+ * occasional imperfect order is ok. So silence KCSAN and avoid the
+ * overhead of locking or any other synchronization.
+ */
+ return data_race(bg1->used > bg2->used);
}
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
@@ -1964,7 +1971,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
* called, which is where we will transfer a reserved extent's
* size from the "reserved" counter to the "used" counter - this
* happens when running delayed references. When we relocate the
- * chunk below, relocation first flushes dellaloc, waits for
+ * chunk below, relocation first flushes delalloc, waits for
* ordered extent completion (which is where we create delayed
* references for data extents) and commits the current
* transaction (which runs delayed references), and only after
@@ -2031,7 +2038,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
btrfs_reclaim_sweep(fs_info);
spin_lock(&fs_info->unused_bgs_lock);
if (!list_empty(&fs_info->reclaim_bgs))
- queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+ queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work);
spin_unlock(&fs_info->unused_bgs_lock);
}
@@ -2064,7 +2071,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key
return -ENOENT;
}
- if (map->start != key->objectid || map->chunk_len != key->offset) {
+ if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
key->objectid, key->offset, map->start, map->chunk_len);
@@ -2077,7 +2084,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
@@ -2238,7 +2245,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return ret;
/* Shouldn't have super stripes in sequential zones */
- if (zoned && nr) {
+ if (unlikely(zoned && nr)) {
kfree(logical);
btrfs_err(fs_info,
"zoned: block group %llu must not contain super block",
@@ -2329,7 +2336,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
break;
bg = btrfs_lookup_block_group(fs_info, map->start);
- if (!bg) {
+ if (unlikely(!bg)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
map->start, map->chunk_len);
@@ -2337,9 +2344,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
btrfs_free_chunk_map(map);
break;
}
- if (bg->start != map->start || bg->length != map->chunk_len ||
- (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
+ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+ (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
map->start, map->chunk_len,
@@ -2832,7 +2839,7 @@ next:
* space or none at all (due to no need to COW, extent buffers
* were already COWed in the current transaction and still
* unwritten, tree heights lower than the maximum possible
- * height, etc). For data we generally reserve the axact amount
+ * height, etc). For data we generally reserve the exact amount
* of space we are going to allocate later, the exception is
* when using compression, as we must reserve space based on the
* uncompressed data size, because the compression is only done
@@ -3241,7 +3248,7 @@ again:
*/
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
/*
* So theoretically we could recover from this, simply set the
* super cache generation to 0 so we know to invalidate the
@@ -3988,7 +3995,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
struct btrfs_space_info *sys_space_info;
sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
- if (!sys_space_info) {
+ if (unlikely(!sys_space_info)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -4002,17 +4009,17 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- } else if (ret) {
+ } else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index a8bb8429c966..9172104a5889 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -63,7 +63,7 @@ enum btrfs_discard_state {
* CHUNK_ALLOC_FORCE means it must try to allocate one
*
* CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from
- * find_free_extent() that also activaes the zone
+ * find_free_extent() that also activates the zone
*/
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b99fb0273292..af373d50a901 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -248,7 +248,7 @@ struct btrfs_inode {
u64 new_delalloc_bytes;
/*
* The offset of the last dir index key that was logged.
- * This is used only for directories.
+ * This is used only for directories. Protected by 'log_mutex'.
*/
u64 last_dir_index_offset;
};
@@ -338,6 +338,11 @@ struct btrfs_inode {
struct list_head delayed_iput;
struct rw_semaphore i_mmap_lock;
+
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info;
+#endif
+
struct inode vfs_inode;
};
@@ -532,9 +537,9 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
/* We only allow BITS_PER_LONGS blocks for each bitmap. */
#ifdef CONFIG_BTRFS_EXPERIMENTAL
- mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0,
- ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits)
- >> PAGE_SHIFT)));
+ mapping_set_folio_order_range(inode->vfs_inode.i_mapping,
+ inode->root->fs_info->block_min_order,
+ inode->root->fs_info->block_max_order);
#endif
}
@@ -542,10 +547,12 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum,
- const u8 * const csum_expected);
+void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr,
+ u8 *dest);
+int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
+ const u8 * const csum_expected);
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
- u32 bio_offset, struct bio_vec *bv);
+ u32 bio_offset, phys_addr_t paddr);
noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
bool nowait);
@@ -558,7 +565,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name);
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const struct fscrypt_str *name, int add_backref, u64 index);
+ const struct fscrypt_str *name, bool add_backref, u64 index);
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d09d622016ef..bacad18357b3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -90,19 +90,19 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
}
static int compression_compress_pages(int type, struct list_head *ws,
- struct address_space *mapping, u64 start,
+ struct btrfs_inode *inode, u64 start,
struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB:
- return zlib_compress_folios(ws, mapping, start, folios,
+ return zlib_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_LZO:
- return lzo_compress_folios(ws, mapping, start, folios,
+ return lzo_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_ZSTD:
- return zstd_compress_folios(ws, mapping, start, folios,
+ return zstd_compress_folios(ws, inode, start, folios,
out_folios, total_in, total_out);
case BTRFS_COMPRESS_NONE:
default:
@@ -223,10 +223,14 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co
/*
* Common wrappers for page allocation from compression wrappers
*/
-struct folio *btrfs_alloc_compr_folio(void)
+struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info)
{
struct folio *folio = NULL;
+ /* For bs > ps cases, no cached folio pool for now. */
+ if (fs_info->block_min_order)
+ goto alloc;
+
spin_lock(&compr_pool.lock);
if (compr_pool.count > 0) {
folio = list_first_entry(&compr_pool.list, struct folio, lru);
@@ -238,13 +242,18 @@ struct folio *btrfs_alloc_compr_folio(void)
if (folio)
return folio;
- return folio_alloc(GFP_NOFS, 0);
+alloc:
+ return folio_alloc(GFP_NOFS, fs_info->block_min_order);
}
void btrfs_free_compr_folio(struct folio *folio)
{
bool do_free = false;
+ /* The folio is from bs > ps fs, no cached pool for now. */
+ if (folio_order(folio))
+ goto free;
+
spin_lock(&compr_pool.lock);
if (compr_pool.count > compr_pool.thresh) {
do_free = true;
@@ -257,6 +266,7 @@ void btrfs_free_compr_folio(struct folio *folio)
if (!do_free)
return;
+free:
ASSERT(folio_ref_count(folio) == 1);
folio_put(folio);
}
@@ -344,16 +354,19 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio)
static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb->bbio.fs_info;
struct bio *bio = &cb->bbio.bio;
u32 offset = 0;
while (offset < cb->compressed_len) {
+ struct folio *folio;
int ret;
- u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
+ u32 len = min_t(u32, cb->compressed_len - offset,
+ btrfs_min_folio_size(fs_info));
+ folio = cb->compressed_folios[offset >> (PAGE_SHIFT + fs_info->block_min_order)];
/* Maximum compressed extent is smaller than bio size limit. */
- ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT],
- len, 0);
+ ret = bio_add_folio(bio, folio, len, 0);
ASSERT(ret);
offset += len;
}
@@ -443,6 +456,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (fs_info->sectorsize < PAGE_SIZE)
return 0;
+ /* For bs > ps cases, we don't support readahead for compressed folios for now. */
+ if (fs_info->block_min_order)
+ return 0;
+
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (cur < compressed_end) {
@@ -602,17 +619,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
cb->compressed_len = compressed_len;
cb->compress_type = btrfs_extent_map_compression(em);
cb->orig_bbio = bbio;
+ cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root;
btrfs_free_extent_map(em);
- cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info));
cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
if (!cb->compressed_folios) {
status = BLK_STS_RESOURCE;
goto out_free_bio;
}
- ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
+ ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order,
+ cb->compressed_folios);
if (ret) {
status = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
@@ -687,8 +706,6 @@ struct heuristic_ws {
struct list_head list;
};
-static struct workspace_manager heuristic_wsm;
-
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -701,7 +718,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info)
{
struct heuristic_ws *ws;
@@ -728,11 +745,9 @@ fail:
return ERR_PTR(-ENOMEM);
}
-const struct btrfs_compress_op btrfs_heuristic_compress = {
- .workspace_manager = &heuristic_wsm,
-};
+const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 };
-static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+static const struct btrfs_compress_levels * const btrfs_compress_levels[] = {
/* The heuristic is represented as compression type 0 */
&btrfs_heuristic_compress,
&btrfs_zlib_compress,
@@ -740,13 +755,13 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zstd_compress,
};
-static struct list_head *alloc_workspace(int type, int level)
+static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
- case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
- case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace();
- case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info);
+ case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(fs_info);
+ case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level);
default:
/*
* This can't happen, the type is validated several times
@@ -772,44 +787,58 @@ static void free_workspace(int type, struct list_head *ws)
}
}
-static void btrfs_init_workspace_manager(int type)
+static int alloc_workspace_manager(struct btrfs_fs_info *fs_info,
+ enum btrfs_compression_type type)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *gwsm;
struct list_head *workspace;
- wsm = btrfs_compress_op[type]->workspace_manager;
- INIT_LIST_HEAD(&wsm->idle_ws);
- spin_lock_init(&wsm->ws_lock);
- atomic_set(&wsm->total_ws, 0);
- init_waitqueue_head(&wsm->ws_wait);
+ ASSERT(fs_info->compr_wsm[type] == NULL);
+ gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL);
+ if (!gwsm)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&gwsm->idle_ws);
+ spin_lock_init(&gwsm->ws_lock);
+ atomic_set(&gwsm->total_ws, 0);
+ init_waitqueue_head(&gwsm->ws_wait);
+ fs_info->compr_wsm[type] = gwsm;
/*
* Preallocate one workspace for each compression type so we can
* guarantee forward progress in the worst case
*/
- workspace = alloc_workspace(type, 0);
+ workspace = alloc_workspace(fs_info, type, 0);
if (IS_ERR(workspace)) {
- btrfs_warn(NULL,
- "cannot preallocate compression workspace, will try later");
+ btrfs_warn(fs_info,
+ "cannot preallocate compression workspace for %s, will try later",
+ btrfs_compress_type2str(type));
} else {
- atomic_set(&wsm->total_ws, 1);
- wsm->free_ws = 1;
- list_add(workspace, &wsm->idle_ws);
+ atomic_set(&gwsm->total_ws, 1);
+ gwsm->free_ws = 1;
+ list_add(workspace, &gwsm->idle_ws);
}
+ return 0;
}
-static void btrfs_cleanup_workspace_manager(int type)
+static void free_workspace_manager(struct btrfs_fs_info *fs_info,
+ enum btrfs_compression_type type)
{
- struct workspace_manager *wsman;
struct list_head *ws;
+ struct workspace_manager *gwsm = fs_info->compr_wsm[type];
- wsman = btrfs_compress_op[type]->workspace_manager;
- while (!list_empty(&wsman->idle_ws)) {
- ws = wsman->idle_ws.next;
+ /* ZSTD uses its own workspace manager, should not enter here. */
+ ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES);
+ if (!gwsm)
+ return;
+ fs_info->compr_wsm[type] = NULL;
+ while (!list_empty(&gwsm->idle_ws)) {
+ ws = gwsm->idle_ws.next;
list_del(ws);
free_workspace(type, ws);
- atomic_dec(&wsman->total_ws);
+ atomic_dec(&gwsm->total_ws);
}
+ kfree(gwsm);
}
/*
@@ -818,9 +847,9 @@ static void btrfs_cleanup_workspace_manager(int type)
* Preallocation makes a forward progress guarantee and we do not return
* errors.
*/
-struct list_head *btrfs_get_workspace(int type, int level)
+struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *wsm = fs_info->compr_wsm[type];
struct list_head *workspace;
int cpus = num_online_cpus();
unsigned nofs_flag;
@@ -830,7 +859,7 @@ struct list_head *btrfs_get_workspace(int type, int level)
wait_queue_head_t *ws_wait;
int *free_ws;
- wsm = btrfs_compress_op[type]->workspace_manager;
+ ASSERT(wsm);
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
@@ -866,7 +895,7 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- workspace = alloc_workspace(type, level);
+ workspace = alloc_workspace(fs_info, type, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -889,7 +918,7 @@ again:
/* no burst */ 1);
if (__ratelimit(&_rs))
- btrfs_warn(NULL,
+ btrfs_warn(fs_info,
"no compression workspaces, low memory, retrying");
}
goto again;
@@ -897,13 +926,13 @@ again:
return workspace;
}
-static struct list_head *get_workspace(int type, int level)
+static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
- case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
- case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level);
- case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
+ case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level);
+ case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(fs_info, type, level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level);
default:
/*
* This can't happen, the type is validated several times
@@ -917,21 +946,21 @@ static struct list_head *get_workspace(int type, int level)
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-void btrfs_put_workspace(int type, struct list_head *ws)
+void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
{
- struct workspace_manager *wsm;
+ struct workspace_manager *gwsm = fs_info->compr_wsm[type];
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
- wsm = btrfs_compress_op[type]->workspace_manager;
- idle_ws = &wsm->idle_ws;
- ws_lock = &wsm->ws_lock;
- total_ws = &wsm->total_ws;
- ws_wait = &wsm->ws_wait;
- free_ws = &wsm->free_ws;
+ ASSERT(gwsm);
+ idle_ws = &gwsm->idle_ws;
+ ws_lock = &gwsm->ws_lock;
+ total_ws = &gwsm->total_ws;
+ ws_wait = &gwsm->ws_wait;
+ free_ws = &gwsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
@@ -948,13 +977,13 @@ wake:
cond_wake_up(ws_wait);
}
-static void put_workspace(int type, struct list_head *ws)
+static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws);
- case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
+ case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(fs_info, type, ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws);
default:
/*
* This can't happen, the type is validated several times
@@ -970,12 +999,12 @@ static void put_workspace(int type, struct list_head *ws)
*/
static int btrfs_compress_set_level(unsigned int type, int level)
{
- const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+ const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
if (level == 0)
- level = ops->default_level;
+ level = levels->default_level;
else
- level = clamp(level, ops->min_level, ops->max_level);
+ level = clamp(level, levels->min_level, levels->max_level);
return level;
}
@@ -985,9 +1014,9 @@ static int btrfs_compress_set_level(unsigned int type, int level)
*/
bool btrfs_compress_level_valid(unsigned int type, int level)
{
- const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+ const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
- return ops->min_level <= level && level <= ops->max_level;
+ return levels->min_level <= level && level <= levels->max_level;
}
/* Wrapper around find_get_page(), with extra error message. */
@@ -1022,44 +1051,46 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
* - compression algo are 0-3
* - the level are bits 4-7
*
- * @out_pages is an in/out parameter, holds maximum number of pages to allocate
- * and returns number of actually allocated pages
+ * @out_folios is an in/out parameter, holds maximum number of folios to allocate
+ * and returns number of actually allocated folios
*
* @total_in is used to return the number of bytes actually read. It
* may be smaller than the input length if we had to exit early because we
- * ran out of room in the pages array or because we cross the
+ * ran out of room in the folios array or because we cross the
* max_out threshold.
*
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
-int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
const unsigned long orig_len = *total_out;
struct list_head *workspace;
int ret;
level = btrfs_compress_set_level(type, level);
- workspace = get_workspace(type, level);
- ret = compression_compress_pages(type, workspace, mapping, start, folios,
+ workspace = get_workspace(fs_info, type, level);
+ ret = compression_compress_pages(type, workspace, inode, start, folios,
out_folios, total_in, total_out);
/* The total read-in bytes should be no larger than the input. */
ASSERT(*total_in <= orig_len);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
return ret;
}
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct list_head *workspace;
int ret;
int type = cb->compress_type;
- workspace = get_workspace(type, 0);
+ workspace = get_workspace(fs_info, type, 0);
ret = compression_decompress_bio(workspace, cb);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
if (!ret)
zero_fill_bio(&cb->orig_bbio->bio);
@@ -1080,20 +1111,50 @@ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
int ret;
/*
- * The full destination page range should not exceed the page size.
+ * The full destination folio range should not exceed the folio size.
* And the @destlen should not exceed sectorsize, as this is only called for
* inline file extents, which should not exceed sectorsize.
*/
- ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+ ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize);
- workspace = get_workspace(type, 0);
+ workspace = get_workspace(fs_info, type, 0);
ret = compression_decompress(type, workspace, data_in, dest_folio,
dest_pgoff, srclen, destlen);
- put_workspace(type, workspace);
+ put_workspace(fs_info, type, workspace);
return ret;
}
+int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
+ if (ret < 0)
+ goto error;
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
+ if (ret < 0)
+ goto error;
+ ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
+ if (ret < 0)
+ goto error;
+ ret = zstd_alloc_workspace_manager(fs_info);
+ if (ret < 0)
+ goto error;
+ return 0;
+error:
+ btrfs_free_compress_wsm(fs_info);
+ return ret;
+}
+
+void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info)
+{
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
+ free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
+ zstd_free_workspace_manager(fs_info);
+}
+
int __init btrfs_init_compress(void)
{
if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
@@ -1105,11 +1166,6 @@ int __init btrfs_init_compress(void)
if (!compr_pool.shrinker)
return -ENOMEM;
- btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
- btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
- btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
- zstd_init_workspace_manager();
-
spin_lock_init(&compr_pool.lock);
INIT_LIST_HEAD(&compr_pool.list);
compr_pool.count = 0;
@@ -1130,10 +1186,6 @@ void __cold btrfs_exit_compress(void)
btrfs_compr_pool_scan(NULL, NULL);
shrinker_free(compr_pool.shrinker);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
- btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
- zstd_cleanup_workspace_manager();
bioset_exit(&btrfs_compressed_bioset);
}
@@ -1256,7 +1308,7 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
#define ENTROPY_LVL_HIGH (80)
/*
- * For increasead precision in shannon_entropy calculation,
+ * For increased precision in shannon_entropy calculation,
* let's do pow(n, M) to save more digits after comma:
*
* - maximum int bit length is 64
@@ -1542,7 +1594,8 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*/
int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct list_head *ws_list = get_workspace(0, 0);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct list_head *ws_list = get_workspace(fs_info, 0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
@@ -1611,30 +1664,34 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
}
out:
- put_workspace(0, ws_list);
+ put_workspace(fs_info, 0, ws_list);
return ret;
}
/*
- * Convert the compression suffix (eg. after "zlib" starting with ":") to
- * level, unrecognized string will set the default level. Negative level
- * numbers are allowed.
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to level.
+ *
+ * If the resulting level exceeds the algo's supported levels, it will be clamped.
+ *
+ * Return <0 if no valid string can be found.
+ * Return 0 if everything is fine.
*/
-int btrfs_compress_str2level(unsigned int type, const char *str)
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret)
{
int level = 0;
int ret;
- if (!type)
+ if (!type) {
+ *level_ret = btrfs_compress_set_level(type, level);
return 0;
+ }
if (str[0] == ':') {
ret = kstrtoint(str + 1, 10, &level);
if (ret)
- level = 0;
+ return ret;
}
- level = btrfs_compress_set_level(type, level);
-
- return level;
+ *level_ret = btrfs_compress_set_level(type, level);
+ return 0;
}
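
As an aside on the reworked btrfs_compress_str2level() contract above (a parse
failure is now reported separately from the clamped level), here is a minimal
userspace sketch of the same parse-and-clamp flow. The MIN_LEVEL, MAX_LEVEL and
DEFAULT_LEVEL values are illustrative stand-ins, not the per-algorithm btrfs
limits.

/*
 * Hedged sketch of the ":<level>" suffix parsing described above.
 * Build with: cc -o str2level str2level.c
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MIN_LEVEL	1	/* illustrative only */
#define MAX_LEVEL	9	/* illustrative only */
#define DEFAULT_LEVEL	3	/* illustrative only */

/* Return 0 and store the clamped level, or -EINVAL for a malformed suffix. */
static int str2level(const char *str, int *level_ret)
{
	long level = 0;
	char *end;

	if (str[0] == ':') {
		errno = 0;
		level = strtol(str + 1, &end, 10);
		if (errno || end == str + 1 || *end != '\0')
			return -EINVAL;
	}

	if (level == 0)
		level = DEFAULT_LEVEL;	/* unspecified -> default */
	else if (level < MIN_LEVEL)
		level = MIN_LEVEL;	/* clamp, as btrfs_compress_set_level() does */
	else if (level > MAX_LEVEL)
		level = MAX_LEVEL;

	*level_ret = (int)level;
	return 0;
}

int main(void)
{
	int level;

	printf("\":7\"  -> ret %d, level %d\n", str2level(":7", &level), level);
	printf("\":15\" -> ret %d, level %d\n", str2level(":15", &level), level);
	printf("\":x\"  -> ret %d (parse error)\n", str2level(":x", &level));
	return 0;
}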
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 1b38e707bbd9..eba188a9e3bb 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -75,6 +75,11 @@ struct compressed_bio {
struct btrfs_bio bbio;
};
+static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb)
+{
+ return cb->bbio.fs_info;
+}
+
/* @range_end must be exclusive. */
static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur)
{
@@ -84,11 +89,14 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u6
return min(range_end, folio_end(folio)) - cur;
}
+int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info);
+void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info);
+
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
bool btrfs_compress_level_valid(unsigned int type, int level);
-int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
@@ -102,21 +110,11 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
-int btrfs_compress_str2level(unsigned int type, const char *str);
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret);
-struct folio *btrfs_alloc_compr_folio(void);
+struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info);
void btrfs_free_compr_folio(struct folio *folio);
-enum btrfs_compression_type {
- BTRFS_COMPRESS_NONE = 0,
- BTRFS_COMPRESS_ZLIB = 1,
- BTRFS_COMPRESS_LZO = 2,
- BTRFS_COMPRESS_ZSTD = 3,
- BTRFS_NR_COMPRESS_TYPES = 4,
-
- BTRFS_DEFRAG_DONT_COMPRESS,
-};
-
struct workspace_manager {
struct list_head idle_ws;
spinlock_t ws_lock;
@@ -128,11 +126,10 @@ struct workspace_manager {
wait_queue_head_t ws_wait;
};
-struct list_head *btrfs_get_workspace(int type, int level);
-void btrfs_put_workspace(int type, struct list_head *ws);
+struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level);
+void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws);
-struct btrfs_compress_op {
- struct workspace_manager *workspace_manager;
+struct btrfs_compress_levels {
/* Maximum level supported by the compression algorithm */
int min_level;
int max_level;
@@ -142,10 +139,10 @@ struct btrfs_compress_op {
/* The heuristic workspaces are managed via the 0th workspace manager */
#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES
-extern const struct btrfs_compress_op btrfs_heuristic_compress;
-extern const struct btrfs_compress_op btrfs_zlib_compress;
-extern const struct btrfs_compress_op btrfs_lzo_compress;
-extern const struct btrfs_compress_op btrfs_zstd_compress;
+extern const struct btrfs_compress_levels btrfs_heuristic_compress;
+extern const struct btrfs_compress_levels btrfs_zlib_compress;
+extern const struct btrfs_compress_levels btrfs_lzo_compress;
+extern const struct btrfs_compress_levels btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
bool btrfs_compress_is_valid_type(const char *str, size_t len);
@@ -155,39 +152,39 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret);
-int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
+struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level);
void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
+struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level);
-int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *lzo_alloc_workspace(void);
+struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info);
void lzo_free_workspace(struct list_head *ws);
-int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(int level);
+int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info);
+void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info);
+struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level);
void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(int level);
-void zstd_put_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level);
+void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws);
#endif
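
The per-filesystem workspace managers declared above follow a simple pattern in
compression.c: allocate the manager, try to preallocate one workspace so that
compression and decompression can always make forward progress, and treat a
preallocation failure as a soft warning only. A self-contained sketch of that
pattern, using assumed names rather than the real btrfs structures:

/* Illustrative "preallocate one workspace for forward progress" pattern. */
#include <stdio.h>
#include <stdlib.h>

struct workspace {
	struct workspace *next;
	void *buf;			/* scratch buffer a real compressor would use */
};

struct ws_manager {
	struct workspace *idle;		/* idle workspace list */
	int total;
};

static struct workspace *alloc_ws(void)
{
	struct workspace *ws = calloc(1, sizeof(*ws));

	if (!ws)
		return NULL;
	ws->buf = malloc(4096);
	if (!ws->buf) {
		free(ws);
		return NULL;
	}
	return ws;
}

static struct ws_manager *alloc_ws_manager(void)
{
	struct ws_manager *m = calloc(1, sizeof(*m));
	struct workspace *ws;

	if (!m)
		return NULL;

	/* Preallocation failure is not fatal: users retry on demand later. */
	ws = alloc_ws();
	if (!ws) {
		fprintf(stderr, "cannot preallocate workspace, will try later\n");
	} else {
		ws->next = m->idle;
		m->idle = ws;
		m->total = 1;
	}
	return m;
}

static void free_ws_manager(struct ws_manager *m)
{
	while (m->idle) {
		struct workspace *ws = m->idle;

		m->idle = ws->next;
		free(ws->buf);
		free(ws);
	}
	free(m);
}

int main(void)
{
	struct ws_manager *m = alloc_ws_manager();

	if (!m)
		return 1;
	printf("preallocated workspaces: %d\n", m->total);
	free_ws_manager(m);
	return 0;
}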
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 74e6d7f3d266..561658aca018 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -30,10 +30,10 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *ins_key, struct btrfs_path *path,
- int data_size, int extend);
+ int data_size, bool extend);
static int push_node_left(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
- struct extent_buffer *src, int empty);
+ struct extent_buffer *src, bool empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
@@ -293,11 +293,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_inc_ref(trans, root, cow, 1);
- if (ret)
+ if (unlikely(ret))
btrfs_abort_transaction(trans, ret);
} else {
ret = btrfs_inc_ref(trans, root, cow, 0);
- if (ret)
+ if (unlikely(ret))
btrfs_abort_transaction(trans, ret);
}
if (ret) {
@@ -536,14 +536,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -556,7 +556,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
parent_start = buf->start;
ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -567,7 +567,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -575,7 +575,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid != btrfs_header_generation(parent));
ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -586,14 +586,14 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(trans, parent);
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
}
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
parent_start, last_ref);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
@@ -613,15 +613,12 @@ error_unlock_cow:
return ret;
}
-static inline int should_cow_block(const struct btrfs_trans_handle *trans,
- const struct btrfs_root *root,
- const struct extent_buffer *buf)
+static inline bool should_cow_block(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf)
{
if (btrfs_is_testing(root->fs_info))
- return 0;
-
- /* Ensure we can see the FORCE_COW bit */
- smp_mb__before_atomic();
+ return false;
/*
* We do not need to cow a block if
@@ -634,13 +631,25 @@ static inline int should_cow_block(const struct btrfs_trans_handle *trans,
* after we've finished copying src root, we must COW the shared
* block to ensure the metadata consistency.
*/
- if (btrfs_header_generation(buf) == trans->transid &&
- !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
- !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
- !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
- return 0;
- return 1;
+
+ if (btrfs_header_generation(buf) != trans->transid)
+ return true;
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
+ return true;
+
+ /* Ensure we can see the FORCE_COW bit. */
+ smp_mb__before_atomic();
+ if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
+ return true;
+
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
+ return false;
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+ return true;
+
+ return false;
}
/*
@@ -844,7 +853,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
&check);
if (IS_ERR(eb))
return eb;
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
return ERR_PTR(-EIO);
}
@@ -913,7 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
btrfs_abort_transaction(trans, ret);
@@ -935,7 +944,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1010,7 +1019,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1019,7 +1028,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &right_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1071,7 +1080,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1081,7 +1090,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_node_key(mid, &mid_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1186,7 +1195,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(mid, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_tree_unlock(left);
free_extent_buffer(left);
btrfs_abort_transaction(trans, ret);
@@ -1246,7 +1255,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_tree_unlock(right);
free_extent_buffer(right);
btrfs_abort_transaction(trans, ret);
@@ -1484,13 +1493,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
reada_for_search(fs_info, p, parent_level, slot, key->objectid);
/* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
+ if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) {
/*
* Do extra check for first_key, eb can be stale due to
* being cached, read from scrub, or have multiple
* parents (shared tree blocks).
*/
- if (btrfs_verify_level_key(tmp, &check)) {
+ if (unlikely(btrfs_verify_level_key(tmp, &check))) {
ret = -EUCLEAN;
goto out;
}
@@ -1571,7 +1580,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!extent_buffer_uptodate(tmp)) {
+ if (unlikely(!extent_buffer_uptodate(tmp))) {
ret = -EIO;
goto out;
}
@@ -1752,7 +1761,7 @@ out:
* The root may have failed to write out at some point, and thus is no
* longer valid, return an error in this case.
*/
- if (!extent_buffer_uptodate(b)) {
+ if (unlikely(!extent_buffer_uptodate(b))) {
if (root_lock)
btrfs_tree_unlock_rw(b, root_lock);
free_extent_buffer(b);
@@ -2260,7 +2269,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
again:
b = btrfs_get_old_root(root, time_seq);
- if (!b) {
+ if (unlikely(!b)) {
ret = -EIO;
goto done;
}
@@ -2686,7 +2695,7 @@ static bool check_sibling_keys(const struct extent_buffer *left,
*/
static int push_node_left(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
- struct extent_buffer *src, int empty)
+ struct extent_buffer *src, bool empty)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int push_items = 0;
@@ -2722,13 +2731,13 @@ static int push_node_left(struct btrfs_trans_handle *trans,
push_items = min(src_nritems - 8, push_items);
/* dst is the left eb, src is the middle eb */
- if (check_sibling_keys(dst, src)) {
+ if (unlikely(check_sibling_keys(dst, src))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2796,7 +2805,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
push_items = max_push;
/* dst is the right eb, src is the middle eb */
- if (check_sibling_keys(src, dst)) {
+ if (unlikely(check_sibling_keys(src, dst))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
return ret;
@@ -2813,7 +2822,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
push_items);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2883,7 +2892,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_clear_buffer_dirty(trans, c);
ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
- if (ret2 < 0)
+ if (unlikely(ret2 < 0))
btrfs_abort_transaction(trans, ret2);
btrfs_tree_unlock(c);
free_extent_buffer(c);
@@ -2928,7 +2937,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
if (level) {
ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
slot, nritems - slot);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2941,7 +2950,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
if (level) {
ret = btrfs_tree_mod_log_insert_key(lower, slot,
BTRFS_MOD_LOG_KEY_ADD);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3017,7 +3026,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ASSERT(btrfs_header_level(c) == level);
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_tree_unlock(split);
free_extent_buffer(split);
btrfs_abort_transaction(trans, ret);
@@ -3086,7 +3095,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf)
int ret;
ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_crit(fs_info,
"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
ret,
@@ -3102,7 +3111,7 @@ int btrfs_leaf_free_space(const struct extent_buffer *leaf)
*/
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- int data_size, int empty,
+ int data_size, bool empty,
struct extent_buffer *right,
int free_space, u32 left_nritems,
u32 min_slot)
@@ -3239,7 +3248,7 @@ out_unlock:
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path,
int min_data_size, int data_size,
- int empty, u32 min_slot)
+ bool empty, u32 min_slot)
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *right;
@@ -3278,7 +3287,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
- if (check_sibling_keys(left, right)) {
+ if (unlikely(check_sibling_keys(left, right))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
btrfs_tree_unlock(right);
@@ -3316,7 +3325,7 @@ out_unlock:
*/
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
struct btrfs_path *path, int data_size,
- int empty, struct extent_buffer *left,
+ bool empty, struct extent_buffer *left,
int free_space, u32 right_nritems,
u32 max_slot)
{
@@ -3494,7 +3503,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- if (check_sibling_keys(left, right)) {
+ if (unlikely(check_sibling_keys(left, right))) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3642,7 +3651,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const struct btrfs_key *ins_key,
struct btrfs_path *path, int data_size,
- int extend)
+ bool extend)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *l;
@@ -4075,7 +4084,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
btrfs_set_item_size(leaf, slot, new_size);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4108,7 +4117,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
old_data = btrfs_item_data_end(leaf, slot);
BUG_ON(slot < 0);
- if (slot >= nritems) {
+ if (unlikely(slot >= nritems)) {
btrfs_print_leaf(leaf);
btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
slot, nritems);
@@ -4135,7 +4144,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
btrfs_set_item_size(leaf, slot, old_size + data_size);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4183,7 +4192,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
data_end = leaf_data_end(leaf);
total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
- if (btrfs_leaf_free_space(leaf) < total_size) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) {
btrfs_print_leaf(leaf);
btrfs_crit(fs_info, "not enough freespace need %u have %d",
total_size, btrfs_leaf_free_space(leaf));
@@ -4193,7 +4202,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
if (slot != nritems) {
unsigned int old_data = btrfs_item_data_end(leaf, slot);
- if (old_data < data_end) {
+ if (unlikely(old_data < data_end)) {
btrfs_print_leaf(leaf);
btrfs_crit(fs_info,
"item at slot %d with data offset %u beyond data end of leaf %u",
@@ -4232,7 +4241,7 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(trans, leaf);
- if (btrfs_leaf_free_space(leaf) < 0) {
+ if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
btrfs_print_leaf(leaf);
BUG();
}
@@ -4374,7 +4383,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (level) {
ret = btrfs_tree_mod_log_insert_move(parent, slot,
slot + 1, nritems - slot - 1);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -4387,7 +4396,7 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
} else if (level) {
ret = btrfs_tree_mod_log_insert_key(parent, slot,
BTRFS_MOD_LOG_KEY_REMOVE);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
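
Most of the ctree.c hunks above only wrap error checks in unlikely(). In the
kernel that macro is built on __builtin_expect() so the compiler can lay out
the error path as the cold branch; a minimal standalone illustration (the macro
definitions mirror <linux/compiler.h> in spirit, not verbatim):

#include <stdio.h>

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

static int do_work(int fail)
{
	if (unlikely(fail)) {
		/* Error path: hinted as cold, kept off the hot path. */
		fprintf(stderr, "operation failed\n");
		return -1;
	}
	return 0;		/* hot path */
}

int main(void)
{
	return do_work(0);
}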
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 738179a5e170..7b277934f66f 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -153,7 +153,7 @@ void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
}
/*
- * Pick the defragable inode that we want, if it doesn't exist, we will get the
+ * Pick the defraggable inode that we want, if it doesn't exist, we will get the
* next one.
*/
static struct inode_defrag *btrfs_pick_defrag_inode(
@@ -924,7 +924,7 @@ again:
folio_put(folio);
goto again;
}
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-EIO);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0f8d8e275143..41e37f7f67cc 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -57,6 +57,7 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
refcount_set(&delayed_node->refs, 0);
+ btrfs_delayed_node_ref_tracker_dir_init(delayed_node);
delayed_node->ins_root = RB_ROOT_CACHED;
delayed_node->del_root = RB_ROOT_CACHED;
mutex_init(&delayed_node->mutex);
@@ -65,7 +66,8 @@ static inline void btrfs_init_delayed_node(
}
static struct btrfs_delayed_node *btrfs_get_delayed_node(
- struct btrfs_inode *btrfs_inode)
+ struct btrfs_inode *btrfs_inode,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_root *root = btrfs_inode->root;
u64 ino = btrfs_ino(btrfs_inode);
@@ -74,6 +76,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = READ_ONCE(btrfs_inode->delayed_node);
if (node) {
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_NOFS);
return node;
}
@@ -83,6 +86,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
if (node) {
if (btrfs_inode->delayed_node) {
refcount_inc(&node->refs); /* can be accessed */
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
BUG_ON(btrfs_inode->delayed_node != node);
xa_unlock(&root->delayed_nodes);
return node;
@@ -106,6 +110,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
*/
if (refcount_inc_not_zero(&node->refs)) {
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker,
+ GFP_ATOMIC);
btrfs_inode->delayed_node = node;
} else {
node = NULL;
@@ -126,7 +133,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* Return the delayed node, or error pointer on failure.
*/
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
- struct btrfs_inode *btrfs_inode)
+ struct btrfs_inode *btrfs_inode,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_node *node;
struct btrfs_root *root = btrfs_inode->root;
@@ -135,7 +143,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
void *ptr;
again:
- node = btrfs_get_delayed_node(btrfs_inode);
+ node = btrfs_get_delayed_node(btrfs_inode, tracker);
if (node)
return node;
@@ -144,12 +152,10 @@ again:
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
- /* Cached in the inode and can be accessed. */
- refcount_set(&node->refs, 2);
-
/* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
if (ret == -ENOMEM) {
+ btrfs_delayed_node_ref_tracker_dir_exit(node);
kmem_cache_free(delayed_node_cache, node);
return ERR_PTR(-ENOMEM);
}
@@ -158,6 +164,7 @@ again:
if (ptr) {
/* Somebody inserted it, go back and read it. */
xa_unlock(&root->delayed_nodes);
+ btrfs_delayed_node_ref_tracker_dir_exit(node);
kmem_cache_free(delayed_node_cache, node);
node = NULL;
goto again;
@@ -166,6 +173,12 @@ again:
ASSERT(xa_err(ptr) != -EINVAL);
ASSERT(xa_err(ptr) != -ENOMEM);
ASSERT(ptr == NULL);
+
+ /* Cached in the inode and can be accessed. */
+ refcount_set(&node->refs, 2);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->inode_cache_tracker, GFP_ATOMIC);
+
btrfs_inode->delayed_node = node;
xa_unlock(&root->delayed_nodes);
@@ -191,6 +204,8 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
list_add_tail(&node->n_list, &root->node_list);
list_add_tail(&node->p_list, &root->prepare_list);
refcount_inc(&node->refs); /* inserted into list */
+ btrfs_delayed_node_ref_tracker_alloc(node, &node->node_list_tracker,
+ GFP_ATOMIC);
root->nodes++;
set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
}
@@ -204,6 +219,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
spin_lock(&root->lock);
if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
root->nodes--;
+ btrfs_delayed_node_ref_tracker_free(node, &node->node_list_tracker);
refcount_dec(&node->refs); /* not in the list */
list_del_init(&node->n_list);
if (!list_empty(&node->p_list))
@@ -214,22 +230,26 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
}
static struct btrfs_delayed_node *btrfs_first_delayed_node(
- struct btrfs_delayed_root *delayed_root)
+ struct btrfs_delayed_root *delayed_root,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_node *node;
spin_lock(&delayed_root->lock);
node = list_first_entry_or_null(&delayed_root->node_list,
struct btrfs_delayed_node, n_list);
- if (node)
+ if (node) {
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
+ }
spin_unlock(&delayed_root->lock);
return node;
}
static struct btrfs_delayed_node *btrfs_next_delayed_node(
- struct btrfs_delayed_node *node)
+ struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_root *delayed_root;
struct list_head *p;
@@ -249,6 +269,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
next = list_entry(p, struct btrfs_delayed_node, n_list);
refcount_inc(&next->refs);
+ btrfs_delayed_node_ref_tracker_alloc(next, tracker, GFP_ATOMIC);
out:
spin_unlock(&delayed_root->lock);
@@ -257,7 +278,7 @@ out:
static void __btrfs_release_delayed_node(
struct btrfs_delayed_node *delayed_node,
- int mod)
+ int mod, struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_root *delayed_root;
@@ -273,6 +294,7 @@ static void __btrfs_release_delayed_node(
btrfs_dequeue_delayed_node(delayed_root, delayed_node);
mutex_unlock(&delayed_node->mutex);
+ btrfs_delayed_node_ref_tracker_free(delayed_node, tracker);
if (refcount_dec_and_test(&delayed_node->refs)) {
struct btrfs_root *root = delayed_node->root;
@@ -282,17 +304,20 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
+ btrfs_delayed_node_ref_tracker_dir_exit(delayed_node);
kmem_cache_free(delayed_node_cache, delayed_node);
}
}
-static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
- __btrfs_release_delayed_node(node, 0);
+ __btrfs_release_delayed_node(node, 0, tracker);
}
static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
- struct btrfs_delayed_root *delayed_root)
+ struct btrfs_delayed_root *delayed_root,
+ struct btrfs_ref_tracker *tracker)
{
struct btrfs_delayed_node *node;
@@ -302,6 +327,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
if (node) {
list_del_init(&node->p_list);
refcount_inc(&node->refs);
+ btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC);
}
spin_unlock(&delayed_root->lock);
@@ -309,9 +335,10 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
}
static inline void btrfs_release_prepared_delayed_node(
- struct btrfs_delayed_node *node)
+ struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
{
- __btrfs_release_delayed_node(node, 1);
+ __btrfs_release_delayed_node(node, 1, tracker);
}
static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
@@ -711,8 +738,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
u32 *ins_sizes;
int i = 0;
- ins_data = kmalloc(batch.nr * sizeof(u32) +
- batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc_array(batch.nr,
+ sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data) {
ret = -ENOMEM;
goto out;
@@ -1011,7 +1038,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
* transaction, because we could leave the inode with the
* improper counts behind.
*/
- if (ret != -ENOENT)
+ if (unlikely(ret != -ENOENT))
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1039,7 +1066,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto err_out;
}
@@ -1126,6 +1153,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret = 0;
@@ -1143,17 +1171,18 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
delayed_root = fs_info->delayed_root;
- curr_node = btrfs_first_delayed_node(delayed_root);
+ curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker);
while (curr_node && (!count || nr--)) {
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
prev_node = curr_node;
- curr_node = btrfs_next_delayed_node(curr_node);
+ prev_delayed_node_tracker = curr_delayed_node_tracker;
+ curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker);
/*
* See the comment below about releasing path before releasing
* node. If the commit of delayed items was successful the path
@@ -1161,7 +1190,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
* point to locked extent buffers (a leaf at the very least).
*/
ASSERT(path->nodes[0] == NULL);
- btrfs_release_delayed_node(prev_node);
+ btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker);
}
/*
@@ -1174,7 +1203,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
btrfs_free_path(path);
if (curr_node)
- btrfs_release_delayed_node(curr_node);
+ btrfs_release_delayed_node(curr_node, &curr_delayed_node_tracker);
trans->block_rsv = block_rsv;
return ret;
@@ -1193,7 +1222,9 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node =
+ btrfs_get_delayed_node(inode, &delayed_node_tracker);
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_block_rsv *block_rsv;
int ret;
@@ -1204,14 +1235,14 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
if (!delayed_node->count) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
mutex_unlock(&delayed_node->mutex);
path = btrfs_alloc_path();
if (!path) {
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -ENOMEM;
}
@@ -1220,7 +1251,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
trans->block_rsv = block_rsv;
return ret;
@@ -1230,18 +1261,20 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_trans_handle *trans;
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node;
struct btrfs_path *path;
struct btrfs_block_rsv *block_rsv;
int ret;
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return 0;
mutex_lock(&delayed_node->mutex);
if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
mutex_unlock(&delayed_node->mutex);
@@ -1275,7 +1308,7 @@ trans_out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
@@ -1289,7 +1322,8 @@ void btrfs_remove_delayed_node(struct btrfs_inode *inode)
return;
inode->delayed_node = NULL;
- btrfs_release_delayed_node(delayed_node);
+
+ btrfs_release_delayed_node(delayed_node, &delayed_node->inode_cache_tracker);
}
struct btrfs_async_delayed_work {
@@ -1305,6 +1339,7 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_delayed_node *delayed_node = NULL;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_root *root;
struct btrfs_block_rsv *block_rsv;
int total_done = 0;
@@ -1321,7 +1356,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
BTRFS_DELAYED_BACKGROUND / 2)
break;
- delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+ delayed_node = btrfs_first_prepared_delayed_node(delayed_root,
+ &delayed_node_tracker);
if (!delayed_node)
break;
@@ -1330,7 +1366,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
btrfs_release_path(path);
- btrfs_release_prepared_delayed_node(delayed_node);
+ btrfs_release_prepared_delayed_node(delayed_node,
+ &delayed_node_tracker);
total_done++;
continue;
}
@@ -1345,7 +1382,8 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
btrfs_btree_balance_dirty_nodelay(root->fs_info);
btrfs_release_path(path);
- btrfs_release_prepared_delayed_node(delayed_node);
+ btrfs_release_prepared_delayed_node(delayed_node,
+ &delayed_node_tracker);
total_done++;
} while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
@@ -1377,10 +1415,15 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
- struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *node;
- if (WARN_ON(node))
+ node = btrfs_first_delayed_node(fs_info->delayed_root, &delayed_node_tracker);
+ if (WARN_ON(node)) {
+ btrfs_delayed_node_ref_tracker_free(node,
+ &delayed_node_tracker);
refcount_dec(&node->refs);
+ }
}
static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
@@ -1454,13 +1497,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_delayed_item *delayed_item;
struct btrfs_dir_item *dir_item;
bool reserve_leaf_space;
u32 data_len;
int ret;
- delayed_node = btrfs_get_or_create_delayed_node(dir);
+ delayed_node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1536,7 +1580,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
mutex_unlock(&delayed_node->mutex);
release_node:
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
@@ -1591,10 +1635,11 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, u64 index)
{
struct btrfs_delayed_node *node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_delayed_item *item;
int ret;
- node = btrfs_get_or_create_delayed_node(dir);
+ node = btrfs_get_or_create_delayed_node(dir, &delayed_node_tracker);
if (IS_ERR(node))
return PTR_ERR(node);
@@ -1635,14 +1680,16 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
}
mutex_unlock(&node->mutex);
end:
- btrfs_release_delayed_node(node);
+ btrfs_release_delayed_node(node, &delayed_node_tracker);
return ret;
}
int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
{
- struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ struct btrfs_ref_tracker delayed_node_tracker;
+ struct btrfs_delayed_node *delayed_node;
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return -ENOENT;
@@ -1652,12 +1699,12 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
* is updated now. So we needn't lock the delayed node.
*/
if (!delayed_node->index_cnt) {
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -EINVAL;
}
inode->index_cnt = delayed_node->index_cnt;
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -1668,8 +1715,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
{
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *item;
+ struct btrfs_ref_tracker delayed_node_tracker;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return false;
@@ -1704,6 +1752,7 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
* insert/delete delayed items in this period. So we also needn't
* requeue or dequeue this delayed node.
*/
+ btrfs_delayed_node_ref_tracker_free(delayed_node, &delayed_node_tracker);
refcount_dec(&delayed_node->refs);
return true;
@@ -1843,19 +1892,19 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
struct btrfs_inode_item *inode_item;
struct inode *vfs_inode = &inode->vfs_inode;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return -ENOENT;
mutex_lock(&delayed_node->mutex);
if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return -ENOENT;
}
@@ -1864,8 +1913,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
@@ -1895,7 +1942,7 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
inode->index_cnt = (u64)-1;
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -1904,9 +1951,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
int ret = 0;
- delayed_node = btrfs_get_or_create_delayed_node(inode);
+ delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1926,7 +1974,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
atomic_inc(&root->fs_info->delayed_root->items);
release_node:
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return ret;
}
@@ -1934,6 +1982,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
/*
* we don't do delayed inode updates during log recovery because it
@@ -1943,7 +1992,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
return -EAGAIN;
- delayed_node = btrfs_get_or_create_delayed_node(inode);
+ delayed_node = btrfs_get_or_create_delayed_node(inode, &delayed_node_tracker);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
@@ -1970,7 +2019,7 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
atomic_inc(&fs_info->delayed_root->items);
release_node:
mutex_unlock(&delayed_node->mutex);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
return 0;
}
@@ -2014,19 +2063,21 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
{
struct btrfs_delayed_node *delayed_node;
+ struct btrfs_ref_tracker delayed_node_tracker;
- delayed_node = btrfs_get_delayed_node(inode);
+ delayed_node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!delayed_node)
return;
__btrfs_kill_delayed_node(delayed_node);
- btrfs_release_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node, &delayed_node_tracker);
}
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
unsigned long index = 0;
struct btrfs_delayed_node *delayed_nodes[8];
+ struct btrfs_ref_tracker delayed_node_trackers[8];
while (1) {
struct btrfs_delayed_node *node;
@@ -2045,6 +2096,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
* about to be removed from the tree in the loop below
*/
if (refcount_inc_not_zero(&node->refs)) {
+ btrfs_delayed_node_ref_tracker_alloc(node,
+ &delayed_node_trackers[count],
+ GFP_ATOMIC);
delayed_nodes[count] = node;
count++;
}
@@ -2056,7 +2110,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
- btrfs_release_delayed_node(delayed_nodes[i]);
+ btrfs_release_delayed_node(delayed_nodes[i],
+ &delayed_node_trackers[i]);
+ btrfs_delayed_node_ref_tracker_dir_print(delayed_nodes[i]);
}
}
}
@@ -2064,14 +2120,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
{
struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker;
- curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
+ curr_node = btrfs_first_delayed_node(fs_info->delayed_root,
+ &curr_delayed_node_tracker);
while (curr_node) {
__btrfs_kill_delayed_node(curr_node);
prev_node = curr_node;
- curr_node = btrfs_next_delayed_node(curr_node);
- btrfs_release_delayed_node(prev_node);
+ prev_delayed_node_tracker = curr_delayed_node_tracker;
+ curr_node = btrfs_next_delayed_node(curr_node, &curr_delayed_node_tracker);
+ btrfs_release_delayed_node(prev_node, &prev_delayed_node_tracker);
}
}
@@ -2081,8 +2140,9 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
{
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
+ struct btrfs_ref_tracker delayed_node_tracker;
- node = btrfs_get_delayed_node(inode);
+ node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!node)
return;
@@ -2140,6 +2200,7 @@ void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
* delete delayed items.
*/
ASSERT(refcount_read(&node->refs) > 1);
+ btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
refcount_dec(&node->refs);
}
@@ -2150,8 +2211,9 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
struct btrfs_delayed_item *next;
+ struct btrfs_ref_tracker delayed_node_tracker;
- node = btrfs_get_delayed_node(inode);
+ node = btrfs_get_delayed_node(inode, &delayed_node_tracker);
if (!node)
return;
@@ -2183,5 +2245,6 @@ void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
* delete delayed items.
*/
ASSERT(refcount_read(&node->refs) > 1);
+ btrfs_delayed_node_ref_tracker_free(node, &delayed_node_tracker);
refcount_dec(&node->refs);
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index e6e763ad2d42..0d949edc0caf 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -16,6 +16,7 @@
#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
+#include <linux/ref_tracker.h>
#include "ctree.h"
struct btrfs_disk_key;
@@ -44,6 +45,22 @@ struct btrfs_delayed_root {
wait_queue_head_t wait;
};
+struct btrfs_ref_tracker_dir {
+#ifdef CONFIG_BTRFS_DEBUG
+ struct ref_tracker_dir dir;
+#else
+ struct {} tracker;
+#endif
+};
+
+struct btrfs_ref_tracker {
+#ifdef CONFIG_BTRFS_DEBUG
+ struct ref_tracker *tracker;
+#else
+ struct {} tracker;
+#endif
+};
+
#define BTRFS_DELAYED_NODE_IN_LIST 0
#define BTRFS_DELAYED_NODE_INODE_DIRTY 1
#define BTRFS_DELAYED_NODE_DEL_IREF 2
@@ -78,6 +95,12 @@ struct btrfs_delayed_node {
* actual number of leaves we end up using. Protected by @mutex.
*/
u32 index_item_leaves;
+ /* Track all references to this delayed node. */
+ struct btrfs_ref_tracker_dir ref_dir;
+ /* Track delayed node reference stored in node list. */
+ struct btrfs_ref_tracker node_list_tracker;
+ /* Track delayed node reference stored in inode cache. */
+ struct btrfs_ref_tracker inode_cache_tracker;
};
struct btrfs_delayed_item {
@@ -169,4 +192,74 @@ void __cold btrfs_delayed_inode_exit(void);
/* for debugging */
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
+#define BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT 16
+#define BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT 16
+
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ ref_tracker_dir_init(&node->ref_dir.dir,
+ BTRFS_DELAYED_NODE_REF_TRACKER_QUARANTINE_COUNT,
+ "delayed_node");
+}
+
+static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ ref_tracker_dir_exit(&node->ref_dir.dir);
+}
+
+static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return;
+
+ ref_tracker_dir_print(&node->ref_dir.dir,
+ BTRFS_DELAYED_NODE_REF_TRACKER_DISPLAY_LIMIT);
+}
+
+static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker,
+ gfp_t gfp)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return 0;
+
+ return ref_tracker_alloc(&node->ref_dir.dir, &tracker->tracker, gfp);
+}
+
+static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
+{
+ if (!btrfs_test_opt(node->root->fs_info, REF_TRACKER))
+ return 0;
+
+ return ref_tracker_free(&node->ref_dir.dir, &tracker->tracker);
+}
+#else
+static inline void btrfs_delayed_node_ref_tracker_dir_init(struct btrfs_delayed_node *node) { }
+
+static inline void btrfs_delayed_node_ref_tracker_dir_exit(struct btrfs_delayed_node *node) { }
+
+static inline void btrfs_delayed_node_ref_tracker_dir_print(struct btrfs_delayed_node *node) { }
+
+static inline int btrfs_delayed_node_ref_tracker_alloc(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker,
+ gfp_t gfp)
+{
+ return 0;
+}
+
+static inline int btrfs_delayed_node_ref_tracker_free(struct btrfs_delayed_node *node,
+ struct btrfs_ref_tracker *tracker)
+{
+ return 0;
+}
+#endif
+
#endif
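For illustration, a minimal sketch of how a caller is expected to pair the tracker helpers declared above with a delayed node reference. The function name and the elided locking are hypothetical; the get/release calls follow the signatures introduced by this patch, and error handling is trimmed.

static int example_use_delayed_node(struct btrfs_inode *inode)
{
	struct btrfs_ref_tracker tracker;
	struct btrfs_delayed_node *node;

	/* Takes a node reference and records it in node->ref_dir on debug builds. */
	node = btrfs_get_delayed_node(inode, &tracker);
	if (!node)
		return -ENOENT;

	/* ... work on the node, typically under node->mutex ... */

	/* Drops the reference and releases the matching tracker entry. */
	btrfs_release_delayed_node(node, &tracker);
	return 0;
}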
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ca382c5b186f..481802efaa14 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -895,7 +895,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
/*
- * Initialize the structure which represents a modification to a an extent.
+ * Initialize the structure which represents a modification to an extent.
*
* @fs_info: Internal to the mounted filesystem mount structure.
*
@@ -952,7 +952,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
bool skip_qgroup)
{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
@@ -969,7 +969,7 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
u64 mod_root, bool skip_qgroup)
{
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
@@ -1251,7 +1251,6 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
{
struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
- bool testing = btrfs_is_testing(fs_info);
spin_lock(&delayed_refs->lock);
while (true) {
@@ -1281,7 +1280,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
- if (!testing && pin_bytes) {
+ if (!btrfs_is_testing(fs_info) && pin_bytes) {
struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, head->bytenr);
@@ -1312,14 +1311,14 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
btrfs_error_unpin_extent_range(fs_info, head->bytenr,
head->bytenr + head->num_bytes - 1);
}
- if (!testing)
+ if (!btrfs_is_testing(fs_info))
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
}
- if (!testing)
+ if (!btrfs_is_testing(fs_info))
btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 552ec4fa645d..5ce940532144 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -276,10 +276,6 @@ struct btrfs_ref {
*/
bool skip_qgroup;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- /* Through which root is this modification. */
- u64 real_root;
-#endif
u64 bytenr;
u64 num_bytes;
u64 owning_root;
@@ -296,6 +292,11 @@ struct btrfs_ref {
struct btrfs_data_ref data_ref;
struct btrfs_tree_ref tree_ref;
};
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /* Through which root is this modification. */
+ u64 real_root;
+#endif
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 4675bcd5f92e..a4eaef60549e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -98,7 +98,7 @@ no_valid_dev_replace_entry_found:
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
return -EUCLEAN;
@@ -158,7 +158,7 @@ no_valid_dev_replace_entry_found:
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ if (unlikely(btrfs_find_device(fs_info->fs_devices, &args))) {
btrfs_err(fs_info,
"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
@@ -177,8 +177,7 @@ no_valid_dev_replace_entry_found:
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
*/
- if (!dev_replace->srcdev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (unlikely(!dev_replace->srcdev && !btrfs_test_opt(fs_info, DEGRADED))) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -186,8 +185,7 @@ no_valid_dev_replace_entry_found:
"srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
src_devid);
}
- if (!dev_replace->tgtdev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (unlikely(!dev_replace->tgtdev && !btrfs_test_opt(fs_info, DEGRADED))) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -637,7 +635,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- DEBUG_WARN("unexpected STARTED ot SUSPENDED dev-replace state");
+ DEBUG_WARN("unexpected STARTED or SUSPENDED dev-replace state");
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
up_write(&dev_replace->rwsem);
goto leave;
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index fe9a4bd7e6e6..802d4dbe5b38 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -786,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
if (iov_iter_alignment(iter) & blocksize_mask)
return -EINVAL;
+ /*
+ * For bs > ps support, we heavily rely on large folios to make sure no
+ * block will cross large folio boundaries.
+ *
+ * But memory provided by direct IO is only virtually contiguous, not
+ * physically contiguous, and will break btrfs' large folio requirement.
+ *
+ * So for bs > ps support, all direct IOs should fall back to buffered ones.
+ */
+ if (fs_info->sectorsize > PAGE_SIZE)
+ return -EINVAL;
+
return 0;
}
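As a rough illustration of the new check, here is the decision it encodes: direct IO is rejected (and the write/read falls back to buffered IO) when the block size exceeds the page size, because user buffers are only virtually contiguous. The helper name and parameters below are hypothetical; only the sectorsize/PAGE_SIZE comparison and the alignment mask mirror the hunk above.

static bool example_dio_must_fall_back(const struct btrfs_fs_info *fs_info,
				       loff_t offset, struct iov_iter *iter)
{
	u32 blocksize_mask = fs_info->sectorsize - 1;

	/* Misaligned start, length or memory: not a valid direct IO. */
	if ((offset | iov_iter_count(iter) | iov_iter_alignment(iter)) & blocksize_mask)
		return true;

	/* bs > ps: one block could span physically discontiguous user pages. */
	return fs_info->sectorsize > PAGE_SIZE;
}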
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 70fc4e7cc5a0..9247a58894de 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -116,7 +116,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
* detect blocks that either didn't get written at all or got written
* in the wrong place.
*/
-int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic)
+int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic)
{
if (!extent_buffer_uptodate(eb))
return 0;
@@ -370,21 +370,21 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
ASSERT(check);
found_start = btrfs_header_bytenr(eb);
- if (found_start != eb->start) {
+ if (unlikely(found_start != eb->start)) {
btrfs_err_rl(fs_info,
"bad tree block start, mirror %u want %llu have %llu",
eb->read_mirror, eb->start, found_start);
ret = -EIO;
goto out;
}
- if (check_tree_block_fsid(eb)) {
+ if (unlikely(check_tree_block_fsid(eb))) {
btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
eb->start, eb->read_mirror);
ret = -EIO;
goto out;
}
found_level = btrfs_header_level(eb);
- if (found_level >= BTRFS_MAX_LEVEL) {
+ if (unlikely(found_level >= BTRFS_MAX_LEVEL)) {
btrfs_err(fs_info,
"bad tree block level, mirror %u level %d on logical %llu",
eb->read_mirror, btrfs_header_level(eb), eb->start);
@@ -404,13 +404,13 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb),
ignore_csum ? ", ignored" : "");
- if (!ignore_csum) {
+ if (unlikely(!ignore_csum)) {
ret = -EUCLEAN;
goto out;
}
}
- if (found_level != check->level) {
+ if (unlikely(found_level != check->level)) {
btrfs_err(fs_info,
"level verify failed on logical %llu mirror %u wanted %u found %u",
eb->start, eb->read_mirror, check->level, found_level);
@@ -639,7 +639,6 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
u64 objectid, gfp_t flags)
{
struct btrfs_root *root;
- bool dummy = btrfs_is_testing(fs_info);
root = kzalloc(sizeof(*root), flags);
if (!root)
@@ -696,7 +695,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
root->log_transid_committed = -1;
btrfs_set_root_last_log_commit(root, 0);
root->anon_dev = 0;
- if (!dummy) {
+ if (!btrfs_is_testing(fs_info)) {
btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages,
IO_TREE_ROOT_DIRTY_LOG_PAGES);
btrfs_extent_io_tree_init(fs_info, &root->log_csum_range,
@@ -1047,7 +1046,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
root->node = NULL;
goto fail;
}
- if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) {
ret = -EIO;
goto fail;
}
@@ -1056,10 +1055,10 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
* For real fs, and not log/reloc trees, root owner must
* match its root node owner
*/
- if (!btrfs_is_testing(fs_info) &&
- btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
- btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
- btrfs_root_id(root) != btrfs_header_owner(root->node)) {
+ if (unlikely(!btrfs_is_testing(fs_info) &&
+ btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
+ btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_root_id(root) != btrfs_header_owner(root->node))) {
btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
btrfs_root_id(root), root->node->start,
@@ -1248,6 +1247,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
if (fs_info->fs_devices)
btrfs_close_devices(fs_info->fs_devices);
+ btrfs_free_compress_wsm(fs_info);
percpu_counter_destroy(&fs_info->stats_read_blocks);
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
@@ -1958,7 +1958,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
- unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;
+ unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU;
fs_info->workers =
btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
@@ -2058,7 +2058,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
u64 bytenr = btrfs_super_log_root(disk_super);
int level = btrfs_super_log_root_level(disk_super);
- if (fs_devices->rw_devices == 0) {
+ if (unlikely(fs_devices->rw_devices == 0)) {
btrfs_warn(fs_info, "log replay required on RO media");
return -EIO;
}
@@ -2079,7 +2079,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
btrfs_put_root(log_tree_root);
return ret;
}
- if (!extent_buffer_uptodate(log_tree_root->node)) {
+ if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) {
btrfs_err(fs_info, "failed to read log tree");
btrfs_put_root(log_tree_root);
return -EIO;
@@ -2087,10 +2087,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
+ btrfs_put_root(log_tree_root);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Failed to recover log tree");
- btrfs_put_root(log_tree_root);
return ret;
}
@@ -2324,7 +2324,7 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
const u32 sectorsize = btrfs_super_sectorsize(sb);
u32 sys_array_size = btrfs_super_sys_array_size(sb);
- if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) {
btrfs_err(fs_info, "system chunk array too big %u > %u",
sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
return -EUCLEAN;
@@ -2342,12 +2342,12 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur);
len = sizeof(*disk_key);
- if (cur + len > sys_array_size)
+ if (unlikely(cur + len > sys_array_size))
goto short_read;
cur += len;
btrfs_disk_key_to_cpu(&key, disk_key);
- if (key.type != BTRFS_CHUNK_ITEM_KEY) {
+ if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) {
btrfs_err(fs_info,
"unexpected item type %u in sys_array at offset %u",
key.type, cur);
@@ -2355,10 +2355,10 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
}
chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur);
num_stripes = btrfs_stack_chunk_num_stripes(chunk);
- if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)
+ if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size))
goto short_read;
type = btrfs_stack_chunk_type(chunk);
- if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) {
btrfs_err(fs_info,
"invalid chunk type %llu in sys_array at offset %u",
type, cur);
@@ -2438,21 +2438,7 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- /*
- * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE.
- *
- * For 4K page sized systems with non-debug builds, all 3 matches (4K).
- * For 4K page sized systems with debug builds, there are two block sizes
- * supported. (4K and 2K)
- *
- * We can support 16K sectorsize with 64K page size without problem,
- * but such sectorsize/pagesize combination doesn't make much sense.
- * 4K will be our future standard, PAGE_SIZE is supported from the very
- * beginning.
- */
- if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K &&
- sectorsize != PAGE_SIZE &&
- sectorsize != BTRFS_MIN_BLOCKSIZE)) {
+ if (!btrfs_supported_blocksize(sectorsize)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
@@ -2619,13 +2605,13 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
ret = btrfs_validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
- if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
+ if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) {
ret = -EUCLEAN;
btrfs_err(fs_info, "invalid csum type, has %u want %u",
btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
goto out;
}
- if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"invalid incompat flags, has 0x%llx valid mask 0x%llx",
@@ -2655,7 +2641,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev
root->node = NULL;
return ret;
}
- if (!extent_buffer_uptodate(root->node)) {
+ if (unlikely(!extent_buffer_uptodate(root->node))) {
free_extent_buffer(root->node);
root->node = NULL;
return -EIO;
@@ -3256,18 +3242,24 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
}
/*
- * Subpage runtime limitation on v1 cache.
+ * Subpage/bs > ps runtime limitation on v1 cache.
*
- * V1 space cache still has some hard codeed PAGE_SIZE usage, while
+ * V1 space cache still has some hard coded PAGE_SIZE usage, while
* we're already defaulting to v2 cache, no need to bother v1 as it's
* going to be deprecated anyway.
*/
- if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
btrfs_warn(fs_info,
"v1 space cache is not supported for page size %lu with sectorsize %u",
PAGE_SIZE, fs_info->sectorsize);
return -EINVAL;
}
+ if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) {
+ btrfs_err(fs_info,
+ "RAID56 is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ return -EINVAL;
+ }
/* This can be called by remount, we need to protect the super block. */
spin_lock(&fs_info->super_lock);
@@ -3396,10 +3388,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->nodesize_bits = ilog2(nodesize);
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT);
+ fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
fs_info->fs_devices->fs_info = fs_info;
+ if (fs_info->sectorsize > PAGE_SIZE)
+ btrfs_warn(fs_info,
+ "support for block size %u with page size %zu is experimental, some features may be missing",
+ fs_info->sectorsize, PAGE_SIZE);
/*
* Handle the space caching options appropriately now that we have the
* super block loaded and validated.
@@ -3421,6 +3419,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+ ret = btrfs_alloc_compress_wsm(fs_info);
+ if (ret)
+ goto fail_sb_buffer;
ret = btrfs_init_workqueues(fs_info);
if (ret)
goto fail_sb_buffer;
@@ -3468,7 +3469,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
- if (!fs_devices->latest_dev->bdev) {
+ if (unlikely(!fs_devices->latest_dev->bdev)) {
btrfs_err(fs_info, "failed to read devices");
ret = -EIO;
goto fail_tree_roots;
@@ -3962,7 +3963,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
* Checks last_flush_error of disks in order to determine the device
* state.
*/
- if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
+ if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL)))
return -EIO;
return 0;
@@ -4064,7 +4065,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
ret = btrfs_validate_write_super(fs_info, sb);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_handle_fs_error(fs_info, -EUCLEAN,
"unexpected superblock corruption detected");
@@ -4075,7 +4076,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
if (ret)
total_errors++;
}
- if (total_errors > max_errors) {
+ if (unlikely(total_errors > max_errors)) {
btrfs_err(fs_info, "%d errors while writing supers",
total_errors);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4100,7 +4101,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
total_errors++;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- if (total_errors > max_errors) {
+ if (unlikely(total_errors > max_errors)) {
btrfs_handle_fs_error(fs_info, -EIO,
"%d errors while writing supers",
total_errors);
@@ -4880,7 +4881,7 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 864a55a96226..57920f2c6fe4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -106,8 +106,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
- int atomic);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, bool atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
const struct btrfs_tree_parent_check *check);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 7fc8a3200b40..d062ac521051 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -174,7 +174,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto fail;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset of -1 found, there would have to exist an
* inode with such number or a root with such id.
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 66361325f6dc..bb2ca1c9c7b0 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -1237,7 +1237,7 @@ hit_next:
state = next_search_state(inserted_state, end);
/*
* If there's a next state, whether contiguous or not, we don't
- * need to unlock and start search agian. If it's not contiguous
+ * need to unlock and start search again. If it's not contiguous
* we will end up here and try to allocate a prealloc state and insert.
*/
if (state)
@@ -1664,7 +1664,7 @@ out:
*/
u64 btrfs_count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- u32 bits, int contig,
+ u32 bits, bool contig,
struct extent_state **cached_state)
{
struct extent_state *state = NULL;
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 36facca37973..6f07b965e8da 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -163,7 +163,7 @@ void __cold btrfs_extent_state_free_cachep(void);
u64 btrfs_count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, u32 bits, int contig,
+ u64 max_bytes, u32 bits, bool contig,
struct extent_state **cached_state);
void btrfs_free_extent_state(struct extent_state *state);
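A small usage sketch for the converted bool parameter: with contig set to false the helper sums every matching range in the search window, while true stops at the first discontinuity. The wrapper name and the choice of EXTENT_DIRTY are illustrative; the signature matches the declaration above.

static u64 example_count_dirty_bytes(struct extent_io_tree *tree, u64 start, u64 end)
{
	u64 range_start = start;

	/* Count all EXTENT_DIRTY bytes in [start, end], not just one contiguous run. */
	return btrfs_count_range_bits(tree, &range_start, end, (u64)-1,
				      EXTENT_DIRTY, false, NULL);
}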
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 97d517cdf2df..dc4ca98c3780 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -325,7 +325,7 @@ search_again:
/*
* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
- * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
+ * is_data == BTRFS_REF_TYPE_DATA, data type is required,
* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
*/
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
@@ -879,7 +879,7 @@ again:
ptr += btrfs_extent_inline_ref_size(type);
continue;
}
- if (type == BTRFS_REF_TYPE_INVALID) {
+ if (unlikely(type == BTRFS_REF_TYPE_INVALID)) {
ret = -EUCLEAN;
goto out;
}
@@ -1210,7 +1210,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
* We're adding refs to a tree block we already own, this
* should not happen at all.
*/
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ if (unlikely(owner < BTRFS_FIRST_FREE_OBJECTID)) {
btrfs_print_leaf(path->nodes[0]);
btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
@@ -2157,7 +2157,7 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
ret = __btrfs_run_delayed_refs(trans, min_bytes);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -2355,7 +2355,7 @@ static noinline int check_committed_ref(struct btrfs_inode *inode,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -2457,7 +2457,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc)
+ bool full_backref, bool inc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 parent;
@@ -2543,15 +2543,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, bool full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, true);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, bool full_backref)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, false);
}
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
@@ -2760,7 +2760,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(cache);
total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
- if (cache == NULL) {
+ if (unlikely(cache == NULL)) {
/* Logic error, something removed the block group. */
ret = -EUCLEAN;
goto out;
@@ -2982,26 +2982,26 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
csum_root = btrfs_csum_root(trans->fs_info, bytenr);
ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
}
ret = btrfs_record_squota_delta(trans->fs_info, delta);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3115,7 +3115,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- if (!is_data && refs_to_drop != 1) {
+ if (unlikely(!is_data && refs_to_drop != 1)) {
btrfs_crit(info,
"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
node->bytenr, refs_to_drop);
@@ -3162,7 +3162,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
if (!found_extent) {
- if (iref) {
+ if (unlikely(iref)) {
abort_and_dump(trans, path,
"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
path->slots[0]);
@@ -3172,7 +3172,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
/* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop, is_data);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3221,7 +3221,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"umm, got %d back from search, was looking for %llu, slot %d",
ret, bytenr, path->slots[0]);
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3254,7 +3254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ if (unlikely(item_size < sizeof(*ei) + sizeof(*bi))) {
abort_and_dump(trans, path,
"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
key.objectid, key.type, key.offset,
@@ -3268,7 +3268,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
refs = btrfs_extent_refs(leaf, ei);
- if (refs < refs_to_drop) {
+ if (unlikely(refs < refs_to_drop)) {
abort_and_dump(trans, path,
"trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
refs_to_drop, refs, bytenr, path->slots[0]);
@@ -3285,7 +3285,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* be updated by remove_extent_backref
*/
if (iref) {
- if (!found_extent) {
+ if (unlikely(!found_extent)) {
abort_and_dump(trans, path,
"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
path->slots[0]);
@@ -3298,7 +3298,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop, is_data);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3314,8 +3314,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
/* In this branch refs == 1 */
if (found_extent) {
- if (is_data && refs_to_drop !=
- extent_data_ref_count(path, iref)) {
+ if (unlikely(is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref))) {
abort_and_dump(trans, path,
"invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
extent_data_ref_count(path, iref),
@@ -3324,7 +3324,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
if (iref) {
- if (path->slots[0] != extent_slot) {
+ if (unlikely(path->slots[0] != extent_slot)) {
abort_and_dump(trans, path,
"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
key.objectid, key.type,
@@ -3339,7 +3339,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* | extent_slot ||extent_slot + 1|
* [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
*/
- if (path->slots[0] != extent_slot + 1) {
+ if (unlikely(path->slots[0] != extent_slot + 1)) {
abort_and_dump(trans, path,
"invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
path->slots[0]);
@@ -3363,7 +3363,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4297,7 +4297,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
}
static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
- struct find_free_extent_ctl *ffe_ctl)
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info)
{
if (ffe_ctl->for_treelog) {
spin_lock(&fs_info->treelog_bg_lock);
@@ -4315,12 +4316,13 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
/*
- * No lock is OK here because avail is monotinically
+ * No lock is OK here because avail is monotonically
* decreasing, and this is just a hint.
*/
u64 avail = block_group->zone_capacity - block_group->alloc_offset;
if (block_group_bits(block_group, ffe_ctl->flags) &&
+ block_group->space_info == space_info &&
avail >= ffe_ctl->num_bytes) {
ffe_ctl->hint_byte = block_group->start;
break;
@@ -4342,7 +4344,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
- return prepare_allocation_zoned(fs_info, ffe_ctl);
+ return prepare_allocation_zoned(fs_info, ffe_ctl, space_info);
default:
BUG();
}
@@ -5061,7 +5063,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
- if (check_eb_lock_owner(buf)) {
+ if (unlikely(check_eb_lock_owner(buf))) {
free_extent_buffer(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -5470,17 +5472,17 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
if (!(wc->flags[level] & flag)) {
ASSERT(path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_dec_ref(trans, root, eb, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_set_disk_extent_flags(trans, eb, flag);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -5582,7 +5584,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
- if (btrfs_buffer_uptodate(next, generation, 0))
+ if (btrfs_buffer_uptodate(next, generation, false))
return 0;
check.level = level - 1;
@@ -5611,7 +5613,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
* If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
*
* If we are DROP_REFERENCE this will figure out if we need to drop our current
- * reference, skipping it if we dropped it from a previous incompleted drop, or
+ * reference, skipping it if we dropped it from a previous uncompleted drop, or
* dropping it if we still have a reference to it.
*/
static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -5636,7 +5638,7 @@ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_r
ref.parent = path->nodes[level]->start;
} else {
ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
- if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
+ if (unlikely(btrfs_root_id(root) != btrfs_header_owner(path->nodes[level]))) {
btrfs_err(root->fs_info, "mismatched block owner");
return -EIO;
}
@@ -5758,7 +5760,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
/*
* We have to walk down into this node, and if we're currently at the
- * DROP_REFERNCE stage and this block is shared then we need to switch
+ * DROP_REFERENCE stage and this block is shared then we need to switch
* to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
*/
if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
@@ -5772,7 +5774,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
level--;
ASSERT(level == btrfs_header_level(next));
- if (level != btrfs_header_level(next)) {
+ if (unlikely(level != btrfs_header_level(next))) {
btrfs_err(root->fs_info, "mismatched level");
ret = -EIO;
goto out_unlock;
@@ -5883,7 +5885,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
}
} else {
ret = btrfs_dec_ref(trans, root, eb, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -5908,13 +5910,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else if (btrfs_root_id(root) != btrfs_header_owner(eb))
+ else if (unlikely(btrfs_root_id(root) != btrfs_header_owner(eb)))
goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else if (btrfs_root_id(root) !=
- btrfs_header_owner(path->nodes[level + 1]))
+ else if (unlikely(btrfs_root_id(root) !=
+ btrfs_header_owner(path->nodes[level + 1])))
goto owner_mismatch;
}
@@ -6049,9 +6051,9 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*
- * If called with for_reloc == 0, may exit early with -EAGAIN
+ * If called with for_reloc not set, may exit early with -EAGAIN
*/
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
+int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc)
{
const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6178,13 +6180,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
while (1) {
ret = walk_down_tree(trans, root, path, wc);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
break;
}
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -6211,7 +6213,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -6247,7 +6249,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
ret = btrfs_del_root(trans, &root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -6255,7 +6257,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (!is_reloc_root) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
} else if (ret > 0) {
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 82d3a82dc712..e970ac42a871 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -140,9 +140,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, bool full_backref);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, bool full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
@@ -155,8 +155,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
- int for_reloc);
+int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c953297aa89a..c123a3ef154a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -101,6 +101,26 @@ struct btrfs_bio_ctrl {
enum btrfs_compression_type compress_type;
u32 len_to_oe_boundary;
blk_opf_t opf;
+ /*
+ * For data read bios, we attempt to optimize csum lookups if the extent
+ * generation is older than the current one. To make this possible, we
+ * need to track the maximum generation of an extent in a bio_ctrl to
+ * make the decision when submitting the bio.
+ *
+ * The pattern between do_readpage(), submit_one_bio() and
+ * submit_extent_folio() is quite subtle, so tracking this is tricky.
+ *
+ * As we process extent E, we might submit a bio with existing built up
+ * extents before adding E to a new bio, or we might just add E to the
+ * bio. As a result, E's generation could apply to the current bio or
+ * to the next one, so we need to be careful to update the bio_ctrl's
+ * generation with E's only when we are sure E is added to bio_ctrl->bbio
+ * in submit_extent_folio().
+ *
+ * See the comment in btrfs_lookup_bio_sums() for more detail on the
+ * need for this optimization.
+ */
+ u64 generation;
btrfs_bio_end_io_t end_io_func;
struct writeback_control *wbc;
@@ -111,8 +131,46 @@ struct btrfs_bio_ctrl {
*/
unsigned long submit_bitmap;
struct readahead_control *ractl;
+
+ /*
+ * The start offset of the last used extent map by a read operation.
+ *
+ * This is for proper compressed read merge.
+ * U64_MAX means we are starting the read and have made no progress yet.
+ *
+ * The current btrfs_bio_is_contig() only uses disk_bytenr as
+ * the condition to check if the read can be merged with previous
+ * bio, which is not correct. E.g. two file extents pointing to the
+ * same extent but with different offset.
+ *
+ * So here we need to do extra checks to only merge reads that are
+ * covered by the same extent map.
+ * Just extent_map::start will be enough, as they are unique
+ * inside the same inode.
+ */
+ u64 last_em_start;
};
+/*
+ * Helper to set the csum search commit root option for a bio_ctrl's bbio
+ * before submitting the bio.
+ *
+ * Only for use by submit_one_bio().
+ */
+static void bio_set_csum_search_commit_root(struct btrfs_bio_ctrl *bio_ctrl)
+{
+ struct btrfs_bio *bbio = bio_ctrl->bbio;
+
+ ASSERT(bbio);
+
+ if (!(btrfs_op(&bbio->bio) == BTRFS_MAP_READ && is_data_inode(bbio->inode)))
+ return;
+
+ bio_ctrl->bbio->csum_search_commit_root =
+ (bio_ctrl->generation &&
+ bio_ctrl->generation < btrfs_get_fs_generation(bbio->inode->root->fs_info));
+}
+
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_bio *bbio = bio_ctrl->bbio;
@@ -123,6 +181,8 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
/* Caller should ensure the bio has at least some range added */
ASSERT(bbio->bio.bi_iter.bi_size);
+ bio_set_csum_search_commit_root(bio_ctrl);
+
if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
btrfs_submit_compressed_read(bbio);
@@ -131,6 +191,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
+ /*
+ * We used the generation to decide whether to lookup csums in the
+ * commit_root or not when we called bio_set_csum_search_commit_root()
+ * above. Now, reset the generation for the next bio.
+ */
+ bio_ctrl->generation = 0;
}
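In short, the helper above arms csum_search_commit_root only when the bio carries a known extent generation that is strictly older than the current filesystem generation. A compact restatement of that condition, with hypothetical names:

static bool example_search_commit_root(u64 max_extent_gen, u64 cur_fs_gen)
{
	/* Generation 0 means "unknown", so never take the commit root path. */
	return max_extent_gen != 0 && max_extent_gen < cur_fs_gen;
}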
/*
@@ -327,6 +393,13 @@ again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
+
+ /*
+ * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
+ * return early without handling any dirty ranges.
+ */
+ ASSERT(max_bytes >= fs_info->sectorsize);
+
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
@@ -352,18 +425,19 @@ again:
if (delalloc_end + 1 - delalloc_start > max_bytes)
delalloc_end = delalloc_start + max_bytes - 1;
- /* step two, lock all the folioss after the folios that has start */
+ /* step two, lock all the folios after the folio that has start */
ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
delalloc_end);
ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
- /* some of the folios are gone, lets avoid looping by
- * shortening the size of the delalloc range we're searching
+ /*
+ * Some of the folios are gone, let's avoid looping by
+ * shortening the size of the delalloc range we're searching.
*/
btrfs_free_extent_state(cached_state);
cached_state = NULL;
if (!loops) {
- max_bytes = PAGE_SIZE;
+ max_bytes = fs_info->sectorsize;
loops = 1;
goto again;
} else {
@@ -552,6 +626,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* Populate every free slot in a provided array with folios using GFP_NOFS.
*
* @nr_folios: number of folios to allocate
+ * @order: the order of the folios to be allocated
* @folio_array: the array to fill with folios; any existing non-NULL entries in
* the array will be skipped
*
@@ -559,12 +634,13 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* -ENOMEM otherwise, the partially allocated folios would be freed and
* the array slots zeroed
*/
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array)
+int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
+ struct folio **folio_array)
{
for (int i = 0; i < nr_folios; i++) {
if (folio_array[i])
continue;
- folio_array[i] = folio_alloc(GFP_NOFS, 0);
+ folio_array[i] = folio_alloc(GFP_NOFS, order);
if (!folio_array[i])
goto error;
}
@@ -573,6 +649,7 @@ error:
for (int i = 0; i < nr_folios; i++) {
if (folio_array[i])
folio_put(folio_array[i]);
+ folio_array[i] = NULL;
}
return -ENOMEM;
}
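A usage sketch for the updated allocator, assuming the behavior documented above: only NULL slots are filled, and on failure everything allocated in this call is freed and the slots are reset to NULL. The function name and array size are illustrative.

static int example_alloc_order0_folios(void)
{
	struct folio *folios[4] = { NULL };
	int ret;

	ret = btrfs_alloc_folio_array(ARRAY_SIZE(folios), 0, folios);
	if (ret)
		return ret;	/* slots were re-zeroed, nothing to clean up */

	/* ... use the folios ... */

	for (int i = 0; i < ARRAY_SIZE(folios); i++)
		folio_put(folios[i]);
	return 0;
}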
@@ -701,15 +778,18 @@ static void alloc_new_bio(struct btrfs_inode *inode,
* @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
+ * @read_em_generation: generation of the extent_map we are submitting
+ * (only used for read)
*
* The will either add the page into the existing @bio_ctrl->bbio, or allocate a
* new one in @bio_ctrl->bbio.
- * The mirror number for this IO should already be initizlied in
+ * The mirror number for this IO should already be initialized in
* @bio_ctrl->mirror_num.
*/
static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
u64 disk_bytenr, struct folio *folio,
- size_t size, unsigned long pg_offset)
+ size_t size, unsigned long pg_offset,
+ u64 read_em_generation)
{
struct btrfs_inode *inode = folio_to_inode(folio);
loff_t file_offset = folio_pos(folio) + pg_offset;
@@ -740,6 +820,11 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
submit_one_bio(bio_ctrl);
continue;
}
+ /*
+ * Now that the folio is definitely added to the bio, include its
+ * generation in the max generation calculation.
+ */
+ bio_ctrl->generation = max(bio_ctrl->generation, read_em_generation);
bio_ctrl->next_file_offset += len;
if (bio_ctrl->wbc)
@@ -909,7 +994,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -942,6 +1027,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
bool force_bio_submit = false;
u64 disk_bytenr;
u64 block_start;
+ u64 em_gen;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
@@ -1019,13 +1105,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (compress_type != BTRFS_COMPRESS_NONE &&
- prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->start)
+ bio_ctrl->last_em_start != U64_MAX &&
+ bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
- if (prev_em_start)
- *prev_em_start = em->start;
+ bio_ctrl->last_em_start = em->start;
+ em_gen = em->generation;
btrfs_free_extent_map(em);
em = NULL;
@@ -1049,7 +1135,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
- pg_offset);
+ pg_offset, em_gen);
}
return 0;
}
@@ -1238,12 +1324,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
const u64 start = folio_pos(folio);
const u64 end = start + folio_size(folio) - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
+ .last_em_start = U64_MAX,
+ };
struct extent_map *em_cached = NULL;
int ret;
lock_extents_for_read(inode, start, end, &cached_state);
- ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_free_extent_map(em_cached);
@@ -1580,7 +1669,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(folio_test_writeback(folio));
submit_extent_folio(bio_ctrl, disk_bytenr, folio,
- sectorsize, filepos - folio_pos(folio));
+ sectorsize, filepos - folio_pos(folio), 0);
return 0;
}
@@ -1601,7 +1690,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long range_bitmap = 0;
bool submitted_io = false;
- bool error = false;
+ int found_error = 0;
const u64 folio_start = folio_pos(folio);
const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur;
@@ -1665,7 +1754,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
*/
btrfs_mark_ordered_io_finished(inode, folio, cur,
fs_info->sectorsize, false);
- error = true;
+ if (!found_error)
+ found_error = ret;
continue;
}
submitted_io = true;
@@ -1682,11 +1772,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
* If we hit any error, the corresponding sector will have its dirty
* flag cleared and writeback finished, thus no need to handle the error case.
*/
- if (!submitted_io && !error) {
+ if (!submitted_io && !found_error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
- return ret;
+ return found_error;
}
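extent_writepage_io() now records the first failure instead of a bare bool and keeps finishing the remaining blocks; a small sketch of that first-error-wins pattern, with submit_block() as a made-up stand-in for the per-sector submission:

#include <stddef.h>

/* Illustrative stand-in for the real per-block submission; returns 0 or -errno. */
static int submit_block(size_t i)
{
	return (i == 2) ? -5 /* -EIO */ : 0;
}

static int write_blocks(size_t nr_blocks)
{
	int found_error = 0;

	for (size_t i = 0; i < nr_blocks; i++) {
		int ret = submit_block(i);

		if (ret < 0) {
			/* Remember only the first error, keep handling the rest of the range. */
			if (!found_error)
				found_error = ret;
			continue;
		}
	}
	return found_error;
}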
/*
@@ -2147,7 +2237,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
* @fs_info: The fs_info for this file system.
* @start: The offset of the range to start waiting on writeback.
* @end: The end of the range, inclusive. This is meant to be used in
- * conjuction with wait_marked_extents, so this will usually be
+ * conjunction with wait_marked_extents, so this will usually be
* the_next_eb->start - 1.
*/
void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
@@ -2417,7 +2507,7 @@ retry:
* In above case, [32K, 96K) is asynchronously submitted
* for compression, and [124K, 128K) needs to be written back.
*
- * If we didn't wait wrtiteback for page 64K, [128K, 128K)
+ * If we didn't wait writeback for page 64K, [128K, 128K)
* won't be submitted as the page still has writeback flag
* and will be skipped in the next check.
*
@@ -2583,7 +2673,8 @@ void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = {
.opf = REQ_OP_READ | REQ_RAHEAD,
- .ractl = rac
+ .ractl = rac,
+ .last_em_start = U64_MAX,
};
struct folio *folio;
struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
@@ -2591,12 +2682,11 @@ void btrfs_readahead(struct readahead_control *rac)
const u64 end = start + readahead_length(rac) - 1;
struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
- u64 prev_em_start = (u64)-1;
lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
- btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
@@ -2901,7 +2991,7 @@ static void cleanup_extent_buffer_folios(struct extent_buffer *eb)
{
const int num_folios = num_extent_folios(eb);
- /* We canont use num_extent_folios() as loop bound as eb->folios changes. */
+ /* We cannot use num_extent_folios() as loop bound as eb->folios changes. */
for (int i = 0; i < num_folios; i++) {
ASSERT(eb->folios[i]);
detach_extent_buffer_folio(eb, eb->folios[i]);
@@ -3148,29 +3238,30 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
*/
static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
{
- if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+ const u32 nodesize = fs_info->nodesize;
+
+ if (unlikely(!IS_ALIGNED(start, fs_info->sectorsize))) {
btrfs_err(fs_info, "bad tree block start %llu", start);
return true;
}
- if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) {
+ if (unlikely(nodesize < PAGE_SIZE && !IS_ALIGNED(start, nodesize))) {
btrfs_err(fs_info,
"tree block is not nodesize aligned, start %llu nodesize %u",
- start, fs_info->nodesize);
+ start, nodesize);
return true;
}
- if (fs_info->nodesize >= PAGE_SIZE &&
- !PAGE_ALIGNED(start)) {
+ if (unlikely(nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start))) {
btrfs_err(fs_info,
"tree block is not page aligned, start %llu nodesize %u",
- start, fs_info->nodesize);
+ start, nodesize);
return true;
}
- if (!IS_ALIGNED(start, fs_info->nodesize) &&
- !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+ if (unlikely(!IS_ALIGNED(start, nodesize) &&
+ !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))) {
btrfs_warn(fs_info,
"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
- start, fs_info->nodesize);
+ start, nodesize);
}
return false;
}
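All of the checks above reduce to power-of-two masking; a standalone illustration of the IS_ALIGNED()-style test, using a 4K sector and 16K nodesize purely as example values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_aligned(uint64_t x, uint64_t a)	/* a must be a power of two */
{
	return (x & (a - 1)) == 0;
}

int main(void)
{
	const uint64_t sectorsize = 4096, nodesize = 16384;	/* example values */
	const uint64_t start = 5 * sectorsize;

	printf("sector aligned: %d, node aligned: %d\n",
	       (int)is_aligned(start, sectorsize), (int)is_aligned(start, nodesize));
	return 0;
}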
@@ -3789,7 +3880,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
return ret;
wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)))
return -EIO;
return 0;
}
@@ -4465,7 +4556,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
if (IS_ERR(eb))
return;
- if (btrfs_buffer_uptodate(eb, gen, 1)) {
+ if (btrfs_buffer_uptodate(eb, gen, true)) {
free_extent_buffer(eb);
return;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 61130786b9a3..5fcbfe44218c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -366,7 +366,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
bool nofail);
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array);
+int btrfs_alloc_folio_array(unsigned int nr_folios, unsigned int order,
+ struct folio **folio_array);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 57f52585a6dd..7e38c23a0c1c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -460,7 +460,7 @@ void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
static inline void setup_extent_mapping(struct btrfs_inode *inode,
struct extent_map *em,
- int modified)
+ bool modified)
{
refcount_inc(&em->refs);
@@ -486,7 +486,7 @@ static inline void setup_extent_mapping(struct btrfs_inode *inode,
* taken, or a reference dropped if the merge attempt was successful.
*/
static int add_extent_mapping(struct btrfs_inode *inode,
- struct extent_map *em, int modified)
+ struct extent_map *em, bool modified)
{
struct extent_map_tree *tree = &inode->extent_tree;
struct btrfs_root *root = inode->root;
@@ -509,7 +509,7 @@ static int add_extent_mapping(struct btrfs_inode *inode,
}
static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len, int strict)
+ u64 start, u64 len, bool strict)
{
struct extent_map *em;
struct rb_node *rb_node;
@@ -548,7 +548,7 @@ static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len)
{
- return lookup_extent_mapping(tree, start, len, 1);
+ return lookup_extent_mapping(tree, start, len, true);
}
/*
@@ -566,7 +566,7 @@ struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree,
struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len)
{
- return lookup_extent_mapping(tree, start, len, 0);
+ return lookup_extent_mapping(tree, start, len, false);
}
/*
@@ -594,7 +594,7 @@ void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *e
static void replace_extent_mapping(struct btrfs_inode *inode,
struct extent_map *cur,
struct extent_map *new,
- int modified)
+ bool modified)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -670,7 +670,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
em->len = end - start;
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
em->offset += start_diff;
- return add_extent_mapping(inode, em, 0);
+ return add_extent_mapping(inode, em, false);
}
/*
@@ -707,7 +707,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
if (em->disk_bytenr == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
- ret = add_extent_mapping(inode, em, 0);
+ ret = add_extent_mapping(inode, em, false);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
@@ -1057,7 +1057,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr
btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL);
write_lock(&em_tree->lock);
em = btrfs_lookup_extent_mapping(em_tree, start, len);
- if (!em) {
+ if (unlikely(!em)) {
ret = -EIO;
goto out_unlock;
}
@@ -1082,7 +1082,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr
split_pre->flags = flags;
split_pre->generation = em->generation;
- replace_extent_mapping(inode, em, split_pre, 1);
+ replace_extent_mapping(inode, em, split_pre, true);
/*
* Now we only have an extent_map at:
@@ -1098,7 +1098,7 @@ int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pr
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->generation = em->generation;
- add_extent_mapping(inode, split_mid, 1);
+ add_extent_mapping(inode, split_mid, true);
/* Once for us */
btrfs_free_extent_map(em);
@@ -1372,7 +1372,7 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
return;
- queue_work(system_unbound_wq, &fs_info->em_shrinker_work);
+ queue_work(system_dfl_wq, &fs_info->em_shrinker_work);
}
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index 7935586a9dbd..f2eaaef8422b 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -153,7 +153,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
if (cache_end > offset) {
if (offset == cache->offset) {
/*
- * We cached a dealloc range (found in the io tree) for
+ * We cached a delalloc range (found in the io tree) for
* a hole or prealloc extent and we have now found a
* file extent item for the same offset. What we have
* now is more recent and up to date, so discard what
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c09fbc257634..a42e6d54e7cd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -397,6 +397,36 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
path->skip_locking = 1;
}
+ /*
+ * If we are searching for a csum of an extent from a past
+ * transaction, we can search in the commit root and reduce
+ * lock contention on the csum tree extent buffers.
+ *
+ * This is important because that lock is an rwsem which gets
+ * pretty heavy write load under memory pressure and sustained
+ * csum overwrites, unlike the commit_root_sem. (Memory pressure
+ * makes us writeback the nodes multiple times per transaction,
+ * which makes us cow them each time, taking the write lock.)
+ *
+ * Due to how rwsem is implemented, there is a possible
+ * priority inversion where the readers holding the lock don't
+ * get scheduled (say they're in a cgroup stuck in heavy reclaim)
+ * which then blocks writers, including transaction commit. By
+ * using a semaphore with fewer writers (only a commit switching
+ * the roots), we make this issue less likely.
+ *
+ * Note that we don't rely on btrfs_search_slot to lock the
+ * commit root csum. We call search_slot multiple times, which would
+ * create a potential race where a commit comes in between searches
+ * while we are not holding the commit_root_sem, and we get csums
+ * from across transactions.
+ */
+ if (bbio->csum_search_commit_root) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ down_read(&fs_info->commit_root_sem);
+ }
+
while (bio_offset < orig_len) {
int count;
u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset;
@@ -442,6 +472,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
bio_offset += count * sectorsize;
}
+ if (bbio->csum_search_commit_root)
+ up_read(&fs_info->commit_root_sem);
return ret;
}
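The comment above turns on holding commit_root_sem across the whole series of csum searches, so a commit (the only writer) cannot switch roots between two lookups; a userspace sketch of the same pattern using a pthread rwlock, purely illustrative:

#include <pthread.h>
#include <stdint.h>

static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;
static uint64_t csum_snapshot[16];	/* stands in for the commit-root csum tree */

/* Illustrative lookup against the snapshot; the real code calls btrfs_search_slot(). */
static uint64_t lookup_csum(unsigned int block)
{
	return csum_snapshot[block % 16];
}

static void lookup_bio_sums(unsigned int first, unsigned int nr, uint64_t *out)
{
	/*
	 * Take the read lock once around all lookups: the writer can only
	 * swap the snapshot while no reader is inside, so every result
	 * comes from the same root.
	 */
	pthread_rwlock_rdlock(&commit_root_sem);
	for (unsigned int i = 0; i < nr; i++)
		out[i] = lookup_csum(first + i);
	pthread_rwlock_unlock(&commit_root_sem);
}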
@@ -743,12 +775,10 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio)
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct bio *bio = &bbio->bio;
struct btrfs_ordered_sum *sums;
- char *data;
- struct bvec_iter iter;
- struct bio_vec bvec;
+ struct bvec_iter iter = bio->bi_iter;
+ phys_addr_t paddr;
+ const u32 blocksize = fs_info->sectorsize;
int index;
- unsigned int blockcount;
- int i;
unsigned nofs_flag;
nofs_flag = memalloc_nofs_save();
@@ -767,21 +797,9 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio)
shash->tfm = fs_info->csum_shash;
- bio_for_each_segment(bvec, bio, iter) {
- blockcount = BTRFS_BYTES_TO_BLKS(fs_info,
- bvec.bv_len + fs_info->sectorsize
- - 1);
-
- for (i = 0; i < blockcount; i++) {
- data = bvec_kmap_local(&bvec);
- crypto_shash_digest(shash,
- data + (i * fs_info->sectorsize),
- fs_info->sectorsize,
- sums->sums + index);
- kunmap_local(data);
- index += fs_info->csum_size;
- }
-
+ btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) {
+ btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index);
+ index += fs_info->csum_size;
}
bbio->sums = sums;
@@ -993,7 +1011,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
* item changed size or key
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
- if (ret && ret != -EAGAIN) {
+ if (unlikely(ret && ret != -EAGAIN)) {
btrfs_abort_transaction(trans, ret);
break;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 204674934795..7efd1f8a1912 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -327,7 +327,7 @@ next_slot:
args->start - extent_offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -426,7 +426,7 @@ delete_extent_item:
key.offset - extent_offset,
0, false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -443,7 +443,7 @@ delete_extent_item:
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -587,21 +587,20 @@ again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ino ||
- key.type != BTRFS_EXTENT_DATA_KEY) {
+ if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
+ if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (key.offset > start || extent_end < end) {
+ if (unlikely(key.offset > start || extent_end < end)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -676,7 +675,7 @@ again:
btrfs_release_path(path);
goto again;
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -704,7 +703,7 @@ again:
ref.ref_root = btrfs_root_id(root);
btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -712,7 +711,7 @@ again:
if (split == start) {
key.offset = start;
} else {
- if (start != key.offset) {
+ if (unlikely(start != key.offset)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -744,7 +743,7 @@ again:
del_slot = path->slots[0] + 1;
del_nr++;
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -762,7 +761,7 @@ again:
del_slot = path->slots[0];
del_nr++;
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -783,7 +782,7 @@ again:
extent_end - key.offset);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -815,7 +814,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64
if (ret)
return ret;
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
return -EIO;
}
@@ -970,7 +969,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
* Return:
* > 0 If we can nocow, and updates @write_bytes.
* 0 If we can't do a nocow write.
- * -EAGAIN If we can't do a nocow write because snapshoting of the inode's
+ * -EAGAIN If we can't do a nocow write because snapshotting of the inode's
* root is in progress or because we are in a non-blocking IO
* context and need to block (@nowait is true).
* < 0 If an error happened.
@@ -2460,9 +2459,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
* got EOPNOTSUPP via prealloc then we messed up and
* need to abort.
*/
- if (ret &&
- (ret != -EOPNOTSUPP ||
- (extent_info && extent_info->is_new_extent)))
+ if (unlikely(ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent))))
btrfs_abort_transaction(trans, ret);
break;
}
@@ -2473,7 +2472,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
cur_offset < ino_size) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
- if (ret) {
+ if (unlikely(ret)) {
/*
* If we failed then we didn't insert our hole
* entries for the area we dropped, so now the
@@ -2493,7 +2492,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_inode_clear_file_extent_range(inode,
cur_offset,
drop_args.drop_end - cur_offset);
- if (ret) {
+ if (unlikely(ret)) {
/*
* We couldn't clear our area, so we could
* presumably adjust up and corrupt the fs, so
@@ -2512,7 +2511,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, replace_len,
drop_args.bytes_found);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -2607,7 +2606,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
cur_offset < drop_args.drop_end) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
- if (ret) {
+ if (unlikely(ret)) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2616,7 +2615,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
drop_args.drop_end - cur_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
@@ -2626,7 +2625,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, extent_info->data_len,
drop_args.bytes_found);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
@@ -3345,7 +3344,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
* We could also use the extent map tree to find such delalloc that is
* being flushed, but using the ordered extents tree is more efficient
* because it's usually much smaller as ordered extents are removed from
- * the tree once they complete. With the extent maps, we mau have them
+ * the tree once they complete. With the extent maps, we may have them
* in the extent map tree for a very long time, and they were either
* created by previous writes or loaded by read operations.
*/
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5d8d1570a5c9..ab873bd67192 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2282,7 +2282,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
* to reserve them to larger extents, however if we have plenty
- * of cache left then go ahead an dadd them, no sense in adding
+ * of cache left then go ahead and add them, no sense in adding
* the overhead of a bitmap if we don't have to.
*/
if (info->bytes <= fs_info->sectorsize * 8) {
@@ -3829,7 +3829,7 @@ out_unlock:
/*
* If we break out of trimming a bitmap prematurely, we should reset the
- * trimming bit. In a rather contrieved case, it's possible to race here so
+ * trimming bit. In a rather contrived case, it's possible to race here so
* reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
*
* start = start of bitmap
@@ -4142,7 +4142,7 @@ int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool act
if (!active) {
set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
ret = cleanup_free_space_cache_v1(fs_info, trans);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index eba7f22ae49c..dad0b492a663 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -137,12 +137,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
DEBUG_WARN();
return -EIO;
}
- if (p->slots[0] == 0) {
+ if (unlikely(p->slots[0] == 0)) {
DEBUG_WARN("no previous slot found");
return -EIO;
}
@@ -218,7 +218,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
- if (!bitmap) {
+ if (unlikely(!bitmap)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -233,7 +233,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -271,7 +271,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -293,7 +293,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
btrfs_release_path(path);
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
@@ -320,7 +320,7 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
data_size);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -361,7 +361,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
- if (!bitmap) {
+ if (unlikely(!bitmap)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -376,7 +376,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -420,7 +420,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -454,7 +454,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
key.offset = (end_bit - start_bit) * fs_info->sectorsize;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -465,7 +465,7 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
@@ -848,14 +848,14 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
return 0;
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
- if (!block_group) {
+ if (unlikely(!block_group)) {
DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
btrfs_abort_transaction(trans, ret);
@@ -1030,14 +1030,14 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
return 0;
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
- if (!block_group) {
+ if (unlikely(!block_group)) {
DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
btrfs_abort_transaction(trans, ret);
@@ -1185,7 +1185,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
goto out_clear;
}
ret = btrfs_global_root_insert(free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_put_root(free_space_root);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -1197,7 +1197,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out_clear;
@@ -1290,14 +1290,14 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
ret = btrfs_del_root(trans, &free_space_root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1315,7 +1315,7 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
free_space_root->node, 0, 1);
btrfs_put_root(free_space_root);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1344,7 +1344,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
ret = clear_free_space_tree(trans, free_space_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1362,7 +1362,7 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
goto next;
ret = populate_free_space_tree(trans, block_group);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1422,7 +1422,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
if (!path) {
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
btrfs_abort_transaction(trans, -ENOMEM);
return -ENOMEM;
}
@@ -1430,7 +1430,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
}
ret = add_new_free_space_info(trans, block_group, path);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1481,7 +1481,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -1496,7 +1496,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1527,7 +1527,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1611,7 +1611,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
extent_count++;
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
@@ -1672,7 +1672,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
extent_count++;
}
- if (extent_count != expected_extent_count) {
+ if (unlikely(extent_count != expected_extent_count)) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index b2bb86f8d7cf..feb0a2faa837 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -55,6 +55,54 @@ size_t __attribute_const__ btrfs_get_num_csums(void)
}
/*
+ * We support the following block sizes for all systems:
+ *
+ * - 4K
+ * This is the most common block size. For PAGE_SIZE > 4K cases the subpage
+ * mode is used.
+ *
+ * - PAGE_SIZE
+ * The straightforward block size to support.
+ *
+ * And extra support for the following block sizes based on the kernel config:
+ *
+ * - MIN_BLOCKSIZE
+ * This is either 4K (regular builds) or 2K (debug builds)
+ * This allows testing subpage routines on x86_64.
+ */
+bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize)
+{
+ /* @blocksize should be validated first. */
+ ASSERT(is_power_of_2(blocksize) && blocksize >= BTRFS_MIN_BLOCKSIZE &&
+ blocksize <= BTRFS_MAX_BLOCKSIZE);
+
+ if (blocksize == PAGE_SIZE || blocksize == SZ_4K || blocksize == BTRFS_MIN_BLOCKSIZE)
+ return true;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /*
+ * For bs > ps support it's done by specifying a minimal folio order
+ * for filemap, thus implying large data folios.
+ * For HIGHMEM systems, we can not always access the content of a (large)
+ * folio in one go, but go through them page by page.
+ *
+ * A lot of features don't implement a proper PAGE sized loop for large
+ * folios, this includes:
+ *
+ * - compression
+ * - verity
+ * - encoded write
+ *
+ * Considering HIGHMEM is such a pain to deal with and it's going
+ * to be deprecated eventually, just reject HIGHMEM && bs > ps cases.
+ */
+ if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE)
+ return false;
+ return true;
+#endif
+ return false;
+}
+
+/*
* Start exclusive operation @type, return true on success.
*/
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
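A standalone sketch of the block size policy documented above, with example constants standing in for PAGE_SIZE, BTRFS_MIN_BLOCKSIZE and CONFIG_BTRFS_EXPERIMENTAL (none of these are the kernel macros, and the caller is assumed to have already validated the power-of-two and 64K-cap constraints):

#include <stdbool.h>
#include <stdint.h>

#define EX_PAGE_SIZE      4096u	/* example: x86_64 */
#define EX_MIN_BLOCKSIZE  4096u	/* 2048 on debug builds */
#define EX_EXPERIMENTAL   1	/* stands in for CONFIG_BTRFS_EXPERIMENTAL */
#define EX_HIGHMEM        0	/* stands in for CONFIG_HIGHMEM */

static bool supported_blocksize(uint32_t bs)
{
	/* 4K, PAGE_SIZE and MIN_BLOCKSIZE are always accepted. */
	if (bs == EX_PAGE_SIZE || bs == 4096u || bs == EX_MIN_BLOCKSIZE)
		return true;
	/* Other sizes need the experimental bs > ps support; HIGHMEM is rejected. */
	if (EX_EXPERIMENTAL)
		return !(EX_HIGHMEM && bs > EX_PAGE_SIZE);
	return false;
}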
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 8cc07cc70b12..814bbc9417d2 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -59,6 +59,8 @@ struct btrfs_space_info;
#define BTRFS_MIN_BLOCKSIZE (SZ_4K)
#endif
+#define BTRFS_MAX_BLOCKSIZE (SZ_64K)
+
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
#define BTRFS_OLDEST_GENERATION 0ULL
@@ -102,6 +104,8 @@ enum {
BTRFS_FS_STATE_RO,
/* Track if a transaction abort has been reported on this filesystem */
BTRFS_FS_STATE_TRANS_ABORTED,
+ /* Track if log replay has failed. */
+ BTRFS_FS_STATE_LOG_REPLAY_ABORTED,
/*
* Bio operations should be blocked on this filesystem because a source
* or target device is being destroyed as part of a device replace
@@ -243,6 +247,7 @@ enum {
BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30),
BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31),
BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32),
+ BTRFS_MOUNT_REF_TRACKER = (1ULL << 33),
};
/*
@@ -280,7 +285,7 @@ enum {
#ifdef CONFIG_BTRFS_EXPERIMENTAL
/*
- * Features under developmen like Extent tree v2 support is enabled
+ * Features under development like Extent tree v2 support is enabled
* only under CONFIG_BTRFS_EXPERIMENTAL
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
@@ -303,6 +308,16 @@ enum {
#define BTRFS_WARNING_COMMIT_INTERVAL (300)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
+enum btrfs_compression_type {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LZO = 2,
+ BTRFS_COMPRESS_ZSTD = 3,
+ BTRFS_NR_COMPRESS_TYPES = 4,
+
+ BTRFS_DEFRAG_DONT_COMPRESS,
+};
+
struct btrfs_dev_replace {
/* See #define above */
u64 replace_state;
@@ -505,6 +520,9 @@ struct btrfs_fs_info {
u64 last_trans_log_full_commit;
unsigned long long mount_opt;
+ /* Compress related structures. */
+ void *compr_wsm[BTRFS_NR_COMPRESS_TYPES];
+
int compress_type;
int compress_level;
u32 commit_interval;
@@ -809,6 +827,8 @@ struct btrfs_fs_info {
u32 sectorsize;
/* ilog2 of sectorsize, use to avoid 64bit division */
u32 sectorsize_bits;
+ u32 block_min_order;
+ u32 block_max_order;
u32 csum_size;
u32 csums_per_leaf;
u32 stripesize;
@@ -878,12 +898,10 @@ struct btrfs_fs_info {
struct lockdep_map btrfs_trans_pending_ordered_map;
struct lockdep_map btrfs_ordered_extent_map;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
spinlock_t ref_verify_lock;
struct rb_root block_tree;
-#endif
-#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
struct list_head allocated_roots;
@@ -905,6 +923,12 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
+/* Return the minimal folio size of the fs. */
+static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info)
+{
+ return 1U << (PAGE_SHIFT + fs_info->block_min_order);
+}
+
static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
{
return READ_ONCE(fs_info->generation);
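btrfs_min_folio_size() above is a single shift; a quick worked illustration, taking a 12-bit page shift (4K pages) as an example:

#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SHIFT 12u	/* example: 4K pages */

static unsigned int min_folio_size(uint32_t block_min_order)
{
	return 1u << (EX_PAGE_SHIFT + block_min_order);
}

int main(void)
{
	/* order 0 -> 4K folios; order 2 -> 16K folios (e.g. 16K blocks on 4K pages) */
	printf("%u %u\n", min_folio_size(0), min_folio_size(2));
	return 0;
}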
@@ -997,6 +1021,7 @@ static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs
return folio_size(folio) >> fs_info->sectorsize_bits;
}
+bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize);
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
@@ -1107,9 +1132,9 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
#define EXPORT_FOR_TESTS
-static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
+static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
- return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+ return unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state));
}
void btrfs_test_destroy_inode(struct inode *inode);
@@ -1118,9 +1143,9 @@ void btrfs_test_destroy_inode(struct inode *inode);
#define EXPORT_FOR_TESTS static
-static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
+static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
- return 0;
+ return false;
}
#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index f06cf701ae5a..1bd73b80f9fa 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -137,7 +137,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*/
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
ref_objectid, name);
- if (!extref) {
+ if (unlikely(!extref)) {
btrfs_abort_transaction(trans, -ENOENT);
return -ENOENT;
}
@@ -627,7 +627,7 @@ delete:
if (control->clear_extent_range) {
ret = btrfs_inode_clear_file_extent_range(control->inode,
clear_start, clear_len);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -666,7 +666,7 @@ delete:
btrfs_init_data_ref(&ref, control->ino, extent_offset,
btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -684,7 +684,7 @@ delete:
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
pending_del_nr);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -720,7 +720,7 @@ out:
int ret2;
ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr);
- if (ret2) {
+ if (unlikely(ret2)) {
btrfs_abort_transaction(trans, ret2);
ret = ret2;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9e4aec7330cb..ced87c9e4682 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -72,6 +72,9 @@
#include "raid-stripe-tree.h"
#include "fiemap.h"
+#define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0)
+#define COW_FILE_RANGE_NO_INLINE (1UL << 1)
+
struct btrfs_iget_args {
u64 ino;
struct btrfs_root *root;
@@ -367,7 +370,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
}
/*
- * Unock inode i_rwsem.
+ * Unlock inode i_rwsem.
*
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
@@ -631,7 +634,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
drop_args.replace_extent = true;
drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -639,7 +642,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
size, compressed_size, compress_type,
compressed_folio, update_i_size);
- if (ret && ret != -ENOSPC) {
+ if (unlikely(ret && ret != -ENOSPC)) {
btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
@@ -649,7 +652,7 @@ static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
ret = btrfs_update_inode(trans, inode);
- if (ret && ret != -ENOSPC) {
+ if (unlikely(ret && ret != -ENOSPC)) {
btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
@@ -851,6 +854,8 @@ static void compress_file_range(struct btrfs_work *work)
struct btrfs_inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
@@ -861,7 +866,7 @@ static void compress_file_range(struct btrfs_work *work)
unsigned long nr_folios;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
- unsigned int poff;
+ unsigned int loff;
int i;
int compress_type = fs_info->compress_type;
int compress_level = fs_info->compress_level;
@@ -899,8 +904,8 @@ static void compress_file_range(struct btrfs_work *work)
actual_end = min_t(u64, i_size, end + 1);
again:
folios = NULL;
- nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
+ nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1;
+ nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift);
/*
* we don't want to send crud past the end of i_size through
@@ -956,18 +961,18 @@ again:
/* Compression level is applied here. */
ret = btrfs_compress_folios(compress_type, compress_level,
- mapping, start, folios, &nr_folios, &total_in,
+ inode, start, folios, &nr_folios, &total_in,
&total_compressed);
if (ret)
goto mark_incompressible;
/*
- * Zero the tail end of the last page, as we might be sending it down
+ * Zero the tail end of the last folio, as we might be sending it down
* to disk.
*/
- poff = offset_in_page(total_compressed);
- if (poff)
- folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
+ loff = (total_compressed & (min_folio_size - 1));
+ if (loff)
+ folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff);
/*
* Try to create an inline extent.
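The tail zeroing above now masks the compressed length against the minimal folio size rather than PAGE_SIZE; a worked example of the arithmetic, with a 16K minimal folio chosen as an illustrative value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t min_folio_size = 16384;	/* example minimal folio size */
	const uint32_t total_compressed = 37000;	/* example compressed length */
	const uint32_t loff = total_compressed & (min_folio_size - 1);	/* 4232 */

	/* Bytes past the compressed data in the last folio must be zeroed. */
	if (loff)
		printf("zero %u bytes at offset %u of the last folio\n",
		       min_folio_size - loff, loff);
	return 0;
}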
@@ -1245,18 +1250,18 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* locked_folio is the folio that writepage had locked already. We use
* it to make sure we don't do extra locks or unlocks.
*
- * When this function fails, it unlocks all pages except @locked_folio.
+ * When this function fails, it unlocks all folios except @locked_folio.
*
* When this function successfully creates an inline extent, it returns 1 and
- * unlocks all pages including locked_folio and starts I/O on them.
- * (In reality inline extents are limited to a single page, so locked_folio is
- * the only page handled anyway).
+ * unlocks all folios including locked_folio and starts I/O on them.
+ * (In reality inline extents are limited to a single block, so locked_folio is
+ * the only folio handled anyway).
*
- * When this function succeed and creates a normal extent, the page locking
+ * When this function succeeds and creates a normal extent, the folio locking
* status depends on the passed in flags:
*
- * - If @keep_locked is set, all pages are kept locked.
- * - Else all pages except for @locked_folio are unlocked.
+ * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked.
+ * - Else all folios except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
* while-loop, the ordered extents created in previous iterations are cleaned up.
@@ -1264,7 +1269,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
static noinline int cow_file_range(struct btrfs_inode *inode,
struct folio *locked_folio, u64 start,
u64 end, u64 *done_offset,
- bool keep_locked, bool no_inline)
+ unsigned long flags)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1292,7 +1297,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- if (!no_inline) {
+ if (!(flags & COW_FILE_RANGE_NO_INLINE)) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
@@ -1320,7 +1325,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* Do set the Ordered (Private2) bit so we know this page was properly
* setup for writepage.
*/
- page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
+ page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
/*
@@ -1531,10 +1536,11 @@ out_unlock:
btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
end - start - cur_alloc_size + 1, NULL);
}
- btrfs_err_rl(fs_info,
- "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
- __func__, btrfs_root_id(inode->root),
- btrfs_ino(inode), orig_start, end + 1 - orig_start, ret);
+ btrfs_err(fs_info,
+"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), orig_start, end + 1 - orig_start,
+ start, cur_alloc_size, ret);
return ret;
}
@@ -1687,7 +1693,7 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
while (start <= end) {
ret = cow_file_range(inode, locked_folio, start, end,
- &done_offset, true, false);
+ &done_offset, COW_FILE_RANGE_KEEP_LOCKED);
if (ret)
return ret;
extent_write_locked_range(&inode->vfs_inode, locked_folio,
@@ -1768,9 +1774,15 @@ static int fallback_to_cow(struct btrfs_inode *inode,
* Don't try to create inline extents, as a mix of inline extent that
* is written out and unlocked directly and a normal NOCOW extent
* doesn't work.
+ *
+ * And here we do not unlock the folio after a successful run.
+ * The folios will be unlocked after everything is finished, or by error handling.
+ *
+ * This is to ensure error handling won't need to clear dirty/ordered flags without
+ * a locked folio, which can race with writeback.
*/
- ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
- true);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL,
+ COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED);
ASSERT(ret != 1);
return ret;
}
@@ -1913,61 +1925,14 @@ static int can_nocow_file_extent(struct btrfs_path *path,
return ret < 0 ? ret : can_nocow;
}
-/*
- * Cleanup the dirty folios which will never be submitted due to error.
- *
- * When running a delalloc range, we may need to split the ranges (due to
- * fragmentation or NOCOW). If we hit an error in the later part, we will error
- * out and previously successfully executed range will never be submitted, thus
- * we have to cleanup those folios by clearing their dirty flag, starting and
- * finishing the writeback.
- */
-static void cleanup_dirty_folios(struct btrfs_inode *inode,
- struct folio *locked_folio,
- u64 start, u64 end, int error)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- pgoff_t start_index = start >> PAGE_SHIFT;
- pgoff_t end_index = end >> PAGE_SHIFT;
- u32 len;
-
- ASSERT(end + 1 - start < U32_MAX);
- ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
- IS_ALIGNED(end + 1, fs_info->sectorsize));
- len = end + 1 - start;
-
- /*
- * Handle the locked folio first.
- * The btrfs_folio_clamp_*() helpers can handle range out of the folio case.
- */
- btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
-
- for (pgoff_t index = start_index; index <= end_index; index++) {
- struct folio *folio;
-
- /* Already handled at the beginning. */
- if (index == locked_folio->index)
- continue;
- folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
- /* Cache already dropped, no need to do any cleanup. */
- if (IS_ERR(folio))
- continue;
- btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
- folio_unlock(folio);
- folio_put(folio);
- }
- mapping_set_error(mapping, error);
-}
-
static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
struct extent_state **cached,
struct can_nocow_file_extent_args *nocow_args,
u64 file_pos, bool is_prealloc)
{
struct btrfs_ordered_extent *ordered;
- u64 len = nocow_args->file_extent.num_bytes;
- u64 end = file_pos + len - 1;
+ const u64 len = nocow_args->file_extent.num_bytes;
+ const u64 end = file_pos + len - 1;
int ret = 0;
btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);
@@ -1978,8 +1943,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
- btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
- return PTR_ERR(em);
+ ret = PTR_ERR(em);
+ goto error;
}
btrfs_free_extent_map(em);
}
@@ -1991,8 +1956,8 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
if (IS_ERR(ordered)) {
if (is_prealloc)
btrfs_drop_extent_map_range(inode, file_pos, end, false);
- btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
- return PTR_ERR(ordered);
+ ret = PTR_ERR(ordered);
+ goto error;
}
if (btrfs_is_data_reloc_root(inode->root))
@@ -2004,23 +1969,30 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
ret = btrfs_reloc_clone_csums(ordered);
btrfs_put_ordered_extent(ordered);
+ if (ret < 0)
+ goto error;
extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
- /*
- * On error, we need to cleanup the ordered extents we created.
- *
- * We do not clear the folio Dirty flags because they are set and
- * cleaered by the caller.
- */
- if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, file_pos, len);
+ PAGE_SET_ORDERED);
+ return ret;
+
+error:
+ btrfs_cleanup_ordered_extents(inode, file_pos, len);
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ btrfs_err(inode->root->fs_info,
+ "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
+ file_pos, len, ret);
return ret;
}
/*
- * when nowcow writeback call back. This checks for snapshots or COW copies
+ * When nocow writeback calls back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
*
* If no cow copies or snapshots exist, we write directly to the existing
@@ -2037,13 +2009,23 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
/*
* If not 0, represents the inclusive end of the last fallback_to_cow()
* range. Only for error handling.
+ *
+ * The same for nocow_end, it's to avoid double cleaning up the range
+ * already cleaned by nocow_one_range().
*/
u64 cow_end = 0;
+ u64 nocow_end = 0;
u64 cur_offset = start;
int ret;
bool check_prev = true;
u64 ino = btrfs_ino(inode);
struct can_nocow_file_extent_args nocow_args = { 0 };
+ /* The range that has ordered extent(s). */
+ u64 oe_cleanup_start;
+ u64 oe_cleanup_len = 0;
+ /* The range that is untouched. */
+ u64 untouched_start;
+ u64 untouched_len = 0;
/*
* Normally on a zoned device we're only doing COW writes, but in case
@@ -2207,8 +2189,10 @@ must_cow:
&nocow_args, cur_offset,
extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (ret < 0)
+ if (ret < 0) {
+ nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
goto error;
+ }
cur_offset = extent_end;
}
btrfs_release_path(path);
@@ -2225,86 +2209,105 @@ must_cow:
cow_start = (u64)-1;
}
- btrfs_free_path(path);
- return 0;
-
-error:
/*
- * There are several error cases:
- *
- * 1) Failed without falling back to COW
- * start cur_offset end
- * |/////////////| |
- *
- * In this case, cow_start should be (u64)-1.
- *
- * For range [start, cur_offset) the folios are already unlocked (except
- * @locked_folio), EXTENT_DELALLOC already removed.
- * Need to clear the dirty flags and finish the ordered extents.
- *
- * 2) Failed with error before calling fallback_to_cow()
- *
- * start cow_start end
- * |/////////////| |
- *
- * In this case, only @cow_start is set, @cur_offset is between
- * [cow_start, end)
- *
- * It's mostly the same as case 1), just replace @cur_offset with
- * @cow_start.
- *
- * 3) Failed with error from fallback_to_cow()
+ * Everything is finished without an error, can unlock the folios now.
*
- * start cow_start cow_end end
- * |/////////////|-----------| |
- *
- * In this case, both @cow_start and @cow_end is set.
- *
- * For range [start, cow_start) it's the same as case 1).
- * But for range [cow_start, cow_end), all the cleanup is handled by
- * cow_file_range(), we should not touch anything in that range.
- *
- * So for all above cases, if @cow_start is set, cleanup ordered extents
- * for range [start, @cow_start), other wise cleanup range [start, @cur_offset).
+ * No need to touch the io tree range nor set folio ordered flag, as
+ * fallback_to_cow() and nocow_one_range() have already handled them.
*/
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
+ extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK);
- if (cur_offset > start) {
- btrfs_cleanup_ordered_extents(inode, start, cur_offset - start);
- cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
- }
+ btrfs_free_path(path);
+ return 0;
- /*
- * If an error happened while a COW region is outstanding, cur_offset
- * needs to be reset to @cow_end + 1 to skip the COW range, as
- * cow_file_range() will do the proper cleanup at error.
- */
- if (cow_end)
- cur_offset = cow_end + 1;
+error:
+ if (cow_start == (u64)-1) {
+ /*
+ * case a)
+ * start cur_offset end
+ * | OE cleanup | Untouched |
+ *
+ * We finished a fallback_to_cow() or nocow_one_range() call,
+ * but failed to check the next range.
+ *
+ * or
+ * start cur_offset nocow_end end
+ * | OE cleanup | Skip | Untouched |
+ *
+ * nocow_one_range() failed, the range [cur_offset, nocow_end] is
+ * already cleaned up.
+ */
+ oe_cleanup_start = start;
+ oe_cleanup_len = cur_offset - start;
+ if (nocow_end)
+ untouched_start = nocow_end + 1;
+ else
+ untouched_start = cur_offset;
+ untouched_len = end + 1 - untouched_start;
+ } else if (cow_start != (u64)-1 && cow_end == 0) {
+ /*
+ * case b)
+ * start cow_start cur_offset end
+ * | OE cleanup | Untouched |
+ *
+ * We got a range that needs COW, but before we hit the next NOCOW range,
+ * thus [cow_start, cur_offset) doesn't yet have any OE.
+ */
+ oe_cleanup_start = start;
+ oe_cleanup_len = cow_start - start;
+ untouched_start = cow_start;
+ untouched_len = end + 1 - untouched_start;
+ } else {
+ /*
+ * case c)
+ * start cow_start cow_end end
+ * | OE cleanup | Skip | Untouched |
+ *
+ * fallback_to_cow() failed, and fallback_to_cow() will do the
+ * cleanup for its range, we shouldn't touch the range
+ * [cow_start, cow_end].
+ */
+ ASSERT(cow_start != (u64)-1 && cow_end != 0);
+ oe_cleanup_start = start;
+ oe_cleanup_len = cow_start - start;
+ untouched_start = cow_end + 1;
+ untouched_len = end + 1 - untouched_start;
+ }
+
+ if (oe_cleanup_len) {
+ const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1;
+ btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len);
+ extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end,
+ locked_folio, NULL,
+ EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK);
+ }
- /*
- * We need to lock the extent here because we're clearing DELALLOC and
- * we're not locked at this point.
- */
- if (cur_offset < end) {
+ if (untouched_len) {
struct extent_state *cached = NULL;
+ const u64 untouched_end = untouched_start + untouched_len - 1;
- btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached);
- extent_clear_unlock_delalloc(inode, cur_offset, end,
+ /*
+ * We need to lock the extent here because we're clearing DELALLOC and
+ * we're not locked at this point.
+ */
+ btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached);
+ extent_clear_unlock_delalloc(inode, untouched_start, untouched_end,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
- btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
+ btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL);
}
btrfs_free_path(path);
- btrfs_err_rl(fs_info,
- "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
- __func__, btrfs_root_id(inode->root),
- btrfs_ino(inode), start, end + 1 - start, ret);
+ btrfs_err(fs_info,
+"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d",
+ __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
+ start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len,
+ untouched_start, untouched_len, ret);
return ret;
}
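A compact standalone sketch of the three error cases above, computing the ordered-extent cleanup range and the untouched range from the same variables the function uses (the helper itself is illustrative, not btrfs code):

#include <stdint.h>

#define U64_MAX ((uint64_t)~0ULL)

struct cleanup_ranges {
	uint64_t oe_cleanup_start, oe_cleanup_len;	/* has ordered extents to clean up */
	uint64_t untouched_start, untouched_len;	/* delalloc still set, only release it */
};

static struct cleanup_ranges split_error_range(uint64_t start, uint64_t end,
					       uint64_t cur_offset, uint64_t cow_start,
					       uint64_t cow_end, uint64_t nocow_end)
{
	struct cleanup_ranges r = { .oe_cleanup_start = start };

	if (cow_start == U64_MAX) {
		/* case a): failed after the last nocow_one_range()/fallback_to_cow() */
		r.oe_cleanup_len = cur_offset - start;
		r.untouched_start = nocow_end ? nocow_end + 1 : cur_offset;
	} else if (cow_end == 0) {
		/* case b): a COW range was found but fallback_to_cow() never ran */
		r.oe_cleanup_len = cow_start - start;
		r.untouched_start = cow_start;
	} else {
		/* case c): fallback_to_cow() failed and cleaned [cow_start, cow_end] itself */
		r.oe_cleanup_len = cow_start - start;
		r.untouched_start = cow_end + 1;
	}
	r.untouched_len = end + 1 - r.untouched_start;
	return r;
}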
@@ -2349,8 +2352,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
true);
else
- ret = cow_file_range(inode, locked_folio, start, end, NULL,
- false, false);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL, 0);
return ret;
}
@@ -2986,7 +2988,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* If we dropped an inline extent here, we know the range where it is
* was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
* number of bytes only for that range containing the inline extent.
- * The remaining of the range will be processed when clearning the
+ * The remaining of the range will be processed when clearing the
* EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/
if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
@@ -3102,14 +3104,15 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
if (!freespace_inode)
btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
- if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+ if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) {
ret = -EIO;
goto out;
}
- if (btrfs_is_zoned(fs_info))
- btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes);
+ ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ if (ret)
+ goto out;
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
@@ -3147,7 +3150,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3155,7 +3158,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
/* Logic error */
ASSERT(list_empty(&ordered_extent->list));
- if (!list_empty(&ordered_extent->list)) {
+ if (unlikely(!list_empty(&ordered_extent->list))) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3163,7 +3166,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
/* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
}
@@ -3190,20 +3193,20 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = add_pending_csums(trans, &ordered_extent->list);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3221,7 +3224,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) { /* -ENOMEM or corruption */
+ if (unlikely(ret)) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3327,21 +3330,47 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
return btrfs_finish_one_ordered(ordered);
}
+void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr,
+ u8 *dest)
+{
+ struct folio *folio = page_folio(phys_to_page(paddr));
+ const u32 blocksize = fs_info->sectorsize;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+
+ shash->tfm = fs_info->csum_shash;
+ /* The full block must be inside the folio. */
+ ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
+
+ if (folio_test_partial_kmap(folio)) {
+ size_t cur = paddr;
+
+ crypto_shash_init(shash);
+ while (cur < paddr + blocksize) {
+ void *kaddr;
+ size_t len = min(paddr + blocksize - cur,
+ PAGE_SIZE - offset_in_page(cur));
+
+ kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur));
+ crypto_shash_update(shash, kaddr, len);
+ kunmap_local(kaddr);
+ cur += len;
+ }
+ crypto_shash_final(shash, dest);
+ } else {
+ crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest);
+ }
+}
/*
* Verify the checksum for a single sector without any extra action that depend
* on the type of I/O.
*
* @kaddr must be a properly kmapped address.
*/
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum,
- const u8 * const csum_expected)
+int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
+ const u8 * const csum_expected)
{
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
-
- if (memcmp(csum, csum_expected, fs_info->csum_size))
+ btrfs_calculate_block_csum(fs_info, paddr, csum);
+ if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
return -EIO;
return 0;
}
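The new btrfs_calculate_block_csum()/btrfs_check_block_csum() pair has to handle a block that crosses page boundaries inside a highmem folio, so it feeds the hash page-sized pieces via kmap_local_folio() instead of assuming one flat mapping. Below is a minimal userspace sketch of that chunked-update pattern, with a toy fold standing in for crypto_shash (the names and the 16K block size are assumptions, purely for illustration): incremental updates over page-sized chunks must yield the same digest as a one-shot pass.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 4096u

/* Toy incremental checksum; stands in for crypto_shash_update(). */
static uint32_t toy_update(uint32_t state, const uint8_t *buf, size_t len)
{
	for (size_t i = 0; i < len; i++)
		state = state * 31 + buf[i];
	return state;
}

int main(void)
{
	static uint8_t block[16384];	/* one 16K block spanning four 4K "pages" */
	uint32_t oneshot, chunked = 0;

	for (size_t i = 0; i < sizeof(block); i++)
		block[i] = (uint8_t)i;

	/* One-shot digest: the !folio_test_partial_kmap() path. */
	oneshot = toy_update(0, block, sizeof(block));

	/* Page-sized updates: the partial-kmap loop. */
	for (size_t cur = 0; cur < sizeof(block); ) {
		size_t len = sizeof(block) - cur;

		if (len > PAGE_SZ - (cur % PAGE_SZ))
			len = PAGE_SZ - (cur % PAGE_SZ);
		chunked = toy_update(chunked, block + cur, len);
		cur += len;
	}

	printf("oneshot=%08x chunked=%08x %s\n", (unsigned int)oneshot,
	       (unsigned int)chunked, oneshot == chunked ? "match" : "MISMATCH");
	return oneshot != chunked;
}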
@@ -3360,17 +3389,16 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum
* Return %true if the sector is ok or had no checksum to start with, else %false.
*/
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
- u32 bio_offset, struct bio_vec *bv)
+ u32 bio_offset, phys_addr_t paddr)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 blocksize = fs_info->sectorsize;
+ struct folio *folio;
u64 file_offset = bbio->file_offset + bio_offset;
- u64 end = file_offset + bv->bv_len - 1;
+ u64 end = file_offset + blocksize - 1;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
- void *kaddr;
-
- ASSERT(bv->bv_len == fs_info->sectorsize);
if (!bbio->csum)
return true;
@@ -3386,12 +3414,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
fs_info->csum_size;
- kaddr = bvec_kmap_local(bv);
- if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) {
- kunmap_local(kaddr);
+ if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected))
goto zeroit;
- }
- kunmap_local(kaddr);
return true;
zeroit:
@@ -3399,7 +3423,9 @@ zeroit:
bbio->mirror_num);
if (dev)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
- memzero_bvec(bv);
+ folio = page_folio(phys_to_page(paddr));
+ ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
+ folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize);
return false;
}
@@ -3513,7 +3539,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
int ret;
ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
- if (ret && ret != -EEXIST) {
+ if (unlikely(ret && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3885,10 +3911,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
bool filled = false;
int first_xattr_slot;
- ret = btrfs_init_file_extent_tree(inode);
- if (ret)
- goto out;
-
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
@@ -3920,8 +3942,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
@@ -3953,6 +3973,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
btrfs_set_inode_mapping_order(inode);
cache_index:
+ ret = btrfs_init_file_extent_tree(inode);
+ if (ret)
+ goto out;
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any
@@ -4263,7 +4288,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_crit(fs_info,
"failed to delete reference to %.*s, root %llu inode %llu parent %llu",
name->len, name->name, btrfs_root_id(root), ino, dir_ino);
@@ -4275,7 +4300,7 @@ skip_backref:
rename_ctx->index = index;
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -4430,7 +4455,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4461,14 +4486,14 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
ret = btrfs_del_root_ref(trans, objectid,
btrfs_root_id(root), dir_ino,
&index, &fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4526,7 +4551,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
@@ -4557,7 +4582,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
- if (atomic_read(&inode->vfs_inode.i_count) > 1)
+ if (icount_read(&inode->vfs_inode) > 1)
d_prune_aliases(&inode->vfs_inode);
min_ino = btrfs_ino(inode) + 1;
@@ -4640,13 +4665,13 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
btrfs_record_snapshot_destroy(trans, dir);
ret = btrfs_unlink_subvol(trans, dir, dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
ret = btrfs_record_root_in_trans(trans, dest);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4660,7 +4685,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_insert_orphan_item(trans,
fs_info->tree_root,
btrfs_root_id(dest));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4668,7 +4693,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4677,7 +4702,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(dest));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4817,7 +4842,7 @@ again:
folio_put(folio);
goto again;
}
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = -EIO;
goto out_unlock;
}
@@ -4905,7 +4930,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e
goto out;
/*
- * Skip the truncatioin if the range in the target block is already aligned.
+ * Skip the truncation if the range in the target block is already aligned.
* The seemingly complex check will also handle the same block case.
*/
if (in_head_block && !IS_ALIGNED(start, blocksize))
@@ -4961,7 +4986,7 @@ again:
folio_put(folio);
goto again;
}
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = -EIO;
goto out_unlock;
}
@@ -5081,7 +5106,7 @@ static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -5601,8 +5626,8 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
- if (location->type != BTRFS_INODE_ITEM_KEY &&
- location->type != BTRFS_ROOT_ITEM_KEY) {
+ if (unlikely(location->type != BTRFS_INODE_ITEM_KEY &&
+ location->type != BTRFS_ROOT_ITEM_KEY)) {
ret = -EUCLEAN;
btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
@@ -5696,7 +5721,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
bool empty = false;
xa_lock(&root->inodes);
- entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ /*
+ * This btrfs_inode is being freed and has already been unhashed at this
+ * point. It's possible that another btrfs_inode has already been
+ * allocated for the same inode and inserted itself into the root, so
+ * don't delete it in that case.
+ *
+ * Note that this shouldn't need to allocate memory, so the gfp flags
+ * don't really matter.
+ */
+ entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
+ GFP_ATOMIC);
if (entry == inode)
empty = xa_empty(&root->inodes);
xa_unlock(&root->inodes);
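Switching from __xa_erase() to __xa_cmpxchg() closes the race described in the comment: while a stale btrfs_inode is being freed, a new one for the same inode number may already have been inserted into root->inodes, and a blind erase would knock out the fresh entry. A minimal userspace sketch of the compare-before-erase idea (a plain array stands in for the xarray; all names here are illustrative):

#include <stdio.h>

#define NR_SLOTS 8

static void *slots[NR_SLOTS];

/* Erase slot 'ino' only if it still points at 'old'; mirrors the
 * __xa_cmpxchg(..., inode, NULL, ...) call above. */
static void erase_if_matches(unsigned int ino, void *old)
{
	if (slots[ino] == old)
		slots[ino] = NULL;
}

int main(void)
{
	int stale_inode = 1, fresh_inode = 2;

	slots[5] = &stale_inode;	/* first btrfs_inode for ino 5 */
	slots[5] = &fresh_inode;	/* re-created inode took over the slot */

	/* Teardown of the stale inode must not remove the fresh one. */
	erase_if_matches(5, &stale_inode);

	printf("slot 5 %s\n", slots[5] == &fresh_inode ?
	       "still holds the fresh inode" : "was wrongly cleared");
	return 0;
}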
@@ -5883,7 +5918,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_CAST(inode);
/* Do extra check against inode mode with di_type */
- if (btrfs_inode_type(inode) != di_type) {
+ if (unlikely(btrfs_inode_type(inode) != di_type)) {
btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
inode->vfs_inode.i_mode, btrfs_inode_type(inode),
@@ -6470,6 +6505,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (!args->subvol)
btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
+ btrfs_set_inode_mapping_order(BTRFS_I(inode));
if (S_ISREG(inode->i_mode)) {
if (btrfs_test_opt(fs_info, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
@@ -6477,7 +6513,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
btrfs_update_inode_mapping_flags(BTRFS_I(inode));
- btrfs_set_inode_mapping_order(BTRFS_I(inode));
}
ret = btrfs_insert_inode_locked(inode);
@@ -6524,7 +6559,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
batch.nr = args->orphan ? 1 : 2;
ret = btrfs_insert_empty_items(trans, root, path, &batch);
- if (ret != 0) {
+ if (unlikely(ret != 0)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
@@ -6601,7 +6636,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
*/
if (!args->subvol) {
ret = btrfs_init_inode_security(trans, args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
@@ -6621,14 +6656,14 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (args->orphan) {
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
} else {
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
0, BTRFS_I(inode)->dir_index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto discard;
}
@@ -6659,7 +6694,7 @@ out:
*/
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
- const struct fscrypt_str *name, int add_backref, u64 index)
+ const struct fscrypt_str *name, bool add_backref, u64 index)
{
int ret = 0;
struct btrfs_key key;
@@ -6692,7 +6727,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_inode_type(inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
- else if (ret) {
+ else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -6805,7 +6840,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct fscrypt_name fname;
u64 index;
int ret;
- int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
@@ -6837,44 +6871,44 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
/* There are several dir indexes for this inode, clear the cache. */
BTRFS_I(inode)->dir_index = 0ULL;
- inc_nlink(inode);
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index);
+ if (ret)
+ goto fail;
- if (ret) {
- drop_inode = 1;
- } else {
- struct dentry *parent = dentry->d_parent;
+ /* Link added, now update the inode item with the new link count. */
+ inc_nlink(inode);
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret)
+ if (inode->i_nlink == 1) {
+ /*
+ * If the new hard link count is 1, it's a file created with the
+ * open(2) O_TMPFILE flag.
+ */
+ ret = btrfs_orphan_del(trans, BTRFS_I(inode));
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
goto fail;
- if (inode->i_nlink == 1) {
- /*
- * If new hard link count is 1, it's a file created
- * with open(2) O_TMPFILE flag.
- */
- ret = btrfs_orphan_del(trans, BTRFS_I(inode));
- if (ret)
- goto fail;
}
- d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
+ /* Grab reference for the new dentry passed to d_instantiate(). */
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);
+
fail:
fscrypt_free_filename(&fname);
if (trans)
btrfs_end_transaction(trans);
- if (drop_inode) {
- inode_dec_link_count(inode);
- iput(inode);
- }
btrfs_btree_balance_dirty(fs_info);
return ret;
}
@@ -7068,7 +7102,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
- if (!S_ISREG(inode->vfs_inode.i_mode)) {
+ if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) {
ret = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
@@ -7145,7 +7179,7 @@ not_found:
insert:
ret = 0;
btrfs_release_path(path);
- if (em->start > start || btrfs_extent_map_end(em) <= start) {
+ if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
@@ -7830,6 +7864,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ /* new_delalloc_bytes and last_dir_index_offset are in a union. */
ei->new_delalloc_bytes = 0;
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
@@ -7964,7 +7999,7 @@ int btrfs_drop_inode(struct inode *inode)
if (btrfs_root_refs(&root->root_item) == 0)
return 1;
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
static void init_once(void *foo)
@@ -7972,6 +8007,9 @@ static void init_once(void *foo)
struct btrfs_inode *ei = foo;
inode_init_once(&ei->vfs_inode);
+#ifdef CONFIG_FS_VERITY
+ ei->i_verity_info = NULL;
+#endif
}
void __cold btrfs_destroy_cachep(void)
@@ -8173,7 +8211,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
if (ret) {
- if (need_abort)
+ if (unlikely(need_abort))
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8221,7 +8259,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8229,12 +8267,12 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8243,7 +8281,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8251,12 +8289,12 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8264,14 +8302,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
new_name, 0, old_idx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
old_name, 0, new_idx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8512,7 +8550,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8520,12 +8558,12 @@ static int btrfs_rename(struct mnt_idmap *idmap,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8536,7 +8574,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8545,7 +8583,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
&new_fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8553,7 +8591,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans,
BTRFS_I(d_inode(new_dentry)));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8562,7 +8600,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
&new_fname.disk_name, 0, index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -8576,7 +8614,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (flags & RENAME_WHITEOUT) {
ret = btrfs_create_new_inode(trans, &whiteout_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
} else {
@@ -8870,7 +8908,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
goto out;
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
discard_new_inode(inode);
@@ -8882,7 +8920,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(name_len);
ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
discard_new_inode(inode);
@@ -9095,7 +9133,7 @@ next:
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
if (own_trans)
btrfs_end_transaction(trans);
@@ -9263,7 +9301,7 @@ static ssize_t btrfs_encoded_read_inline(
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
extent_start, 0);
if (ret) {
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
/* The extent item disappeared? */
return -EIO;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7e13de2bdcbf..a454b5ba2097 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -376,13 +376,13 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (comp) {
ret = btrfs_set_prop(trans, inode, "btrfs.compression",
comp, strlen(comp), 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0);
- if (ret && ret != -ENODATA) {
+ if (unlikely(ret && ret != -ENODATA)) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -633,7 +633,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_clear_buffer_dirty(trans, leaf);
btrfs_tree_unlock(leaf);
ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
- if (ret2 < 0)
+ if (unlikely(ret2 < 0))
btrfs_abort_transaction(trans, ret2);
free_extent_buffer(leaf);
goto out;
@@ -654,14 +654,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
/* ... and new_root is owned by new_inode_args.inode now. */
ret = btrfs_record_root_in_trans(trans, new_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -669,7 +669,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_record_new_subvolume(trans, BTRFS_I(dir));
ret = btrfs_create_new_inode(trans, &new_inode_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -957,7 +957,7 @@ static noinline int btrfs_mksnapshot(struct dentry *parent,
/*
* Force new buffered writes to reserve space even when NOCOW is
- * possible. This is to avoid later writeback (running dealloc) to
+ * possible. This is to avoid later writeback (running delalloc) to
* fallback to COW mode and unexpectedly fail with ENOSPC.
*/
btrfs_drew_read_lock(&root->snapshot_lock);
@@ -1251,7 +1251,7 @@ out:
}
static noinline int btrfs_ioctl_snap_create(struct file *file,
- void __user *arg, int subvol)
+ void __user *arg, bool subvol)
{
struct btrfs_ioctl_vol_args *vol_args;
int ret;
@@ -2133,7 +2133,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
ret = btrfs_next_leaf(fs_info->tree_root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -2216,7 +2216,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -2245,7 +2245,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
ret = btrfs_next_item(root, path);
if (ret < 0) {
goto out;
- } else if (ret > 0) {
+ } else if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -4008,7 +4008,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(root));
- if (ret && ret != -ENOENT) {
+ if (unlikely(ret && ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
@@ -4032,7 +4032,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
ret = btrfs_uuid_tree_add(trans, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
btrfs_root_id(root));
- if (ret < 0 && ret != -EEXIST) {
+ if (unlikely(ret < 0 && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
@@ -4418,6 +4418,10 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
goto out_acct;
}
+ if (fs_info->sectorsize > PAGE_SIZE) {
+ ret = -ENOTTY;
+ goto out_acct;
+ }
if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
struct btrfs_ioctl_encoded_io_args_32 args32;
@@ -4509,6 +4513,7 @@ out_acct:
static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
{
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode);
struct btrfs_ioctl_encoded_io_args args;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
@@ -4522,6 +4527,11 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
goto out_acct;
}
+ if (fs_info->sectorsize > PAGE_SIZE) {
+ ret = -ENOTTY;
+ goto out_acct;
+ }
+
if (!(file->f_mode & FMODE_WRITE)) {
ret = -EBADF;
goto out_acct;
@@ -4780,14 +4790,14 @@ out_fail:
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
+ struct file *file = cmd->file;
+ struct btrfs_inode *inode = BTRFS_I(file->f_inode);
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
size_t copy_end;
int ret;
u64 disk_bytenr, disk_io_size;
- struct file *file;
- struct btrfs_inode *inode;
- struct btrfs_fs_info *fs_info;
- struct extent_io_tree *io_tree;
loff_t pos;
struct kiocb kiocb;
struct extent_state *cached_state = NULL;
@@ -4803,10 +4813,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
ret = -EPERM;
goto out_acct;
}
- file = cmd->file;
- inode = BTRFS_I(file->f_inode);
- fs_info = inode->root->fs_info;
- io_tree = &inode->io_tree;
+ if (fs_info->sectorsize > PAGE_SIZE) {
+ ret = -ENOTTY;
+ goto out_acct;
+ }
+
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
if (issue_flags & IO_URING_F_COMPAT) {
@@ -4933,9 +4944,10 @@ out_acct:
static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
+ struct file *file = cmd->file;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode);
loff_t pos;
struct kiocb kiocb;
- struct file *file;
ssize_t ret;
void __user *sqe_addr;
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
@@ -4948,8 +4960,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
ret = -EPERM;
goto out_acct;
}
+ if (fs_info->sectorsize > PAGE_SIZE) {
+ ret = -ENOTTY;
+ goto out_acct;
+ }
- file = cmd->file;
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
if (!(file->f_mode & FMODE_WRITE)) {
@@ -5223,13 +5238,13 @@ long btrfs_ioctl(struct file *file, unsigned int
case FITRIM:
return btrfs_ioctl_fitrim(fs_info, argp);
case BTRFS_IOC_SNAP_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 0);
+ return btrfs_ioctl_snap_create(file, argp, false);
case BTRFS_IOC_SNAP_CREATE_V2:
- return btrfs_ioctl_snap_create_v2(file, argp, 0);
+ return btrfs_ioctl_snap_create_v2(file, argp, false);
case BTRFS_IOC_SUBVOL_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 1);
+ return btrfs_ioctl_snap_create(file, argp, true);
case BTRFS_IOC_SUBVOL_CREATE_V2:
- return btrfs_ioctl_snap_create_v2(file, argp, 1);
+ return btrfs_ioctl_snap_create_v2(file, argp, true);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp, false);
case BTRFS_IOC_SNAP_DESTROY_V2:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index a3e6d9616e60..0035851d72b0 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -361,7 +361,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
atomic_inc(&lock->readers);
/*
- * Ensure the pending reader count is perceieved BEFORE this reader
+ * Ensure the pending reader count is perceived BEFORE this reader
* goes to sleep in case of active writers. This guarantees new writers
* won't be allowed and that the current reader will be woken up when
* the last active writer finishes its jobs.
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index af29df98ac14..a4673e7d95d7 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -74,7 +74,7 @@ enum btrfs_lock_nesting {
BTRFS_NESTING_NEW_ROOT,
/*
- * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
+ * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so
* add this in here and add a static_assert to keep us from going over
* the limit. As of this writing we're limited to 8, and we're
* definitely using 8, hence this check to keep us from messing up in
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d403641889ca..4758f66da449 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -58,9 +58,6 @@
* 0x1000 | SegHdr N+1| Data payload N+1 ... |
*/
-#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
-#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
-
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
@@ -68,7 +65,14 @@ struct workspace {
struct list_head list;
};
-static struct workspace_manager wsm;
+static u32 workspace_buf_length(const struct btrfs_fs_info *fs_info)
+{
+ return lzo1x_worst_compress(fs_info->sectorsize);
+}
+static u32 workspace_cbuf_length(const struct btrfs_fs_info *fs_info)
+{
+ return lzo1x_worst_compress(fs_info->sectorsize);
+}
void lzo_free_workspace(struct list_head *ws)
{
@@ -80,7 +84,7 @@ void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *lzo_alloc_workspace(void)
+struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info)
{
struct workspace *workspace;
@@ -89,8 +93,8 @@ struct list_head *lzo_alloc_workspace(void)
return ERR_PTR(-ENOMEM);
workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN);
- workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN);
- workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN);
+ workspace->buf = kvmalloc(workspace_buf_length(fs_info), GFP_KERNEL | __GFP_NOWARN);
+ workspace->cbuf = kvmalloc(workspace_cbuf_length(fs_info), GFP_KERNEL | __GFP_NOWARN);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -128,19 +132,21 @@ static inline size_t read_compress_length(const char *buf)
*
* Will allocate new pages when needed.
*/
-static int copy_compressed_data_to_page(char *compressed_data,
+static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info,
+ char *compressed_data,
size_t compressed_size,
struct folio **out_folios,
unsigned long max_nr_folio,
- u32 *cur_out,
- const u32 sectorsize)
+ u32 *cur_out)
{
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
u32 sector_bytes_left;
u32 orig_out;
struct folio *cur_folio;
char *kaddr;
- if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
+ if ((*cur_out >> min_folio_shift) >= max_nr_folio)
return -E2BIG;
/*
@@ -149,18 +155,17 @@ static int copy_compressed_data_to_page(char *compressed_data,
*/
ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
- cur_folio = out_folios[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out >> min_folio_shift];
/* Allocate a new page */
if (!cur_folio) {
- cur_folio = btrfs_alloc_compr_folio();
+ cur_folio = btrfs_alloc_compr_folio(fs_info);
if (!cur_folio)
return -ENOMEM;
- out_folios[*cur_out / PAGE_SIZE] = cur_folio;
+ out_folios[*cur_out >> min_folio_shift] = cur_folio;
}
- kaddr = kmap_local_folio(cur_folio, 0);
- write_compress_length(kaddr + offset_in_page(*cur_out),
- compressed_size);
+ kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out));
+ write_compress_length(kaddr, compressed_size);
*cur_out += LZO_LEN;
orig_out = *cur_out;
@@ -172,20 +177,20 @@ static int copy_compressed_data_to_page(char *compressed_data,
kunmap_local(kaddr);
- if ((*cur_out / PAGE_SIZE) >= max_nr_folio)
+ if ((*cur_out >> min_folio_shift) >= max_nr_folio)
return -E2BIG;
- cur_folio = out_folios[*cur_out / PAGE_SIZE];
+ cur_folio = out_folios[*cur_out >> min_folio_shift];
/* Allocate a new page */
if (!cur_folio) {
- cur_folio = btrfs_alloc_compr_folio();
+ cur_folio = btrfs_alloc_compr_folio(fs_info);
if (!cur_folio)
return -ENOMEM;
- out_folios[*cur_out / PAGE_SIZE] = cur_folio;
+ out_folios[*cur_out >> min_folio_shift] = cur_folio;
}
kaddr = kmap_local_folio(cur_folio, 0);
- memcpy(kaddr + offset_in_page(*cur_out),
+ memcpy(kaddr + offset_in_folio(cur_folio, *cur_out),
compressed_data + *cur_out - orig_out, copy_len);
*cur_out += copy_len;
@@ -209,12 +214,15 @@ out:
return 0;
}
-int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
+int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
- const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
struct folio *folio_in = NULL;
char *sizes_ptr;
const unsigned long max_nr_folio = *out_folios;
@@ -263,9 +271,9 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len,
folios, max_nr_folio,
- &cur_out, sectorsize);
+ &cur_out);
if (ret < 0)
goto out;
@@ -280,8 +288,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- /* Check if we have reached page boundary */
- if (PAGE_ALIGNED(cur_in)) {
+ /* Check if we have reached folio boundary. */
+ if (IS_ALIGNED(cur_in, min_folio_size)) {
folio_put(folio_in);
folio_in = NULL;
}
@@ -298,7 +306,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
out:
if (folio_in)
folio_put(folio_in);
- *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE);
+ *out_folios = DIV_ROUND_UP(cur_out, min_folio_size);
return ret;
}
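Compressed output offsets are now mapped to folios with *cur_out >> min_folio_shift, where min_folio_shift = PAGE_SHIFT + fs_info->block_min_order, instead of dividing by PAGE_SIZE. A quick userspace sketch of that arithmetic, assuming (as the shift implies) that the minimum compressed folio size is PAGE_SIZE << block_min_order:

#include <stdio.h>

#define PAGE_SHIFT 12u	/* 4K pages assumed for the example */

int main(void)
{
	unsigned int block_min_order = 2;	/* 16K minimum folios (assumption) */
	unsigned int min_folio_shift = PAGE_SHIFT + block_min_order;
	unsigned long min_folio_size = 1ul << min_folio_shift;
	unsigned long offsets[] = { 0, 4096, 16384, 40960 };

	for (unsigned int i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
		unsigned long off = offsets[i];

		/* folio index = off >> min_folio_shift, like out_folios[...] above */
		printf("offset %6lu -> folio index %lu, offset in folio %lu\n",
		       off, off >> min_folio_shift, off & (min_folio_size - 1));
	}
	return 0;
}

With 16K folios the indexes come out as 0, 0, 1 and 2; dividing by PAGE_SIZE would instead give 0, 1, 4 and 10 and index the wrong folio array entries.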
@@ -310,15 +318,16 @@ out:
static void copy_compressed_segment(struct compressed_bio *cb,
char *dest, u32 len, u32 *cur_in)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- struct folio *cur_folio;
- u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
- orig_in + len - *cur_in);
+ struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift];
+ u32 copy_len = min_t(u32, orig_in + len - *cur_in,
+ folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in));
ASSERT(copy_len);
- cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE];
memcpy_from_folio(dest + *cur_in - orig_in, cur_folio,
offset_in_folio(cur_folio, *cur_in), copy_len);
@@ -332,6 +341,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
struct workspace *workspace = list_entry(ws, struct workspace, list);
const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
char *kaddr;
int ret;
/* Compressed data length, can be unaligned */
@@ -378,14 +388,14 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
*/
ASSERT(cur_in / sectorsize ==
(cur_in + LZO_LEN - 1) / sectorsize);
- cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE];
+ cur_folio = cb->compressed_folios[cur_in >> min_folio_shift];
ASSERT(cur_folio);
kaddr = kmap_local_folio(cur_folio, 0);
- seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in));
kunmap_local(kaddr);
cur_in += LZO_LEN;
- if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) {
+ if (unlikely(seg_len > workspace_cbuf_length(fs_info))) {
struct btrfs_inode *inode = cb->bbio.inode;
/*
@@ -445,19 +455,19 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
- size_t max_segment_len = WORKSPACE_BUF_LENGTH;
+ size_t max_segment_len = workspace_buf_length(fs_info);
int ret = 0;
- if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
+ if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2))
return -EUCLEAN;
in_len = read_compress_length(data_in);
- if (in_len != srclen)
+ if (unlikely(in_len != srclen))
return -EUCLEAN;
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
- if (in_len != srclen - LZO_LEN * 2) {
+ if (unlikely(in_len != srclen - LZO_LEN * 2)) {
ret = -EUCLEAN;
goto out;
}
@@ -487,8 +497,7 @@ out:
return ret;
}
-const struct btrfs_compress_op btrfs_lzo_compress = {
- .workspace_manager = &wsm,
+const struct btrfs_compress_levels btrfs_lzo_compress = {
.max_level = 1,
.default_level = 1,
};
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 363fd28c0268..a0cf8effe008 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -18,6 +18,7 @@ static const char fs_state_chars[] = {
[BTRFS_FS_STATE_REMOUNTING] = 'M',
[BTRFS_FS_STATE_RO] = 0,
[BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_LOG_REPLAY_ABORTED] = 'O',
[BTRFS_FS_STATE_DEV_REPLACING] = 'R',
[BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
[BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C',
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 022ebc89af85..4416c165644f 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -4,7 +4,6 @@
#define BTRFS_MESSAGES_H
#include <linux/types.h>
-#include <linux/types.h>
#include <linux/printk.h>
#include <linux/bug.h>
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index ff5eac84d819..60f9b000d644 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -11,6 +11,7 @@
#include <linux/pagemap.h>
#include <linux/math64.h>
#include <linux/rbtree.h>
+#include <linux/bio.h>
/*
* Enumerate bits using enum autoincrement. Define the @name as the n-th bit.
@@ -20,6 +21,54 @@
name = (1U << __ ## name ## _BIT), \
__ ## name ## _SEQ = __ ## name ## _BIT
+static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter)
+{
+ struct bio_vec bv = bio_iter_iovec(bio, *iter);
+
+ return bvec_phys(&bv);
+}
+
+/*
+ * Iterate a bio using the btrfs block size.
+ *
+ * This handles large folios and highmem.
+ *
+ * @paddr: Physical memory address of each iteration
+ * @bio: The bio to iterate
+ * @iter: The bvec_iter (pointer) to use.
+ * @blocksize: The blocksize to iterate.
+ *
+ * This requires all folios in the bio to cover at least one block.
+ */
+#define btrfs_bio_for_each_block(paddr, bio, iter, blocksize) \
+ for (; (iter)->bi_size && \
+ (paddr = bio_iter_phys((bio), (iter)), 1); \
+ bio_advance_iter_single((bio), (iter), (blocksize)))
+
+/* Initialize a bvec_iter to the size of the specified bio. */
+static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ u32 bio_size = 0;
+ int i;
+
+ bio_for_each_bvec_all(bvec, bio, i)
+ bio_size += bvec->bv_len;
+
+ return (struct bvec_iter) {
+ .bi_sector = 0,
+ .bi_size = bio_size,
+ .bi_idx = 0,
+ .bi_bvec_done = 0,
+ };
+}
+
+#define btrfs_bio_for_each_block_all(paddr, bio, blocksize) \
+ for (struct bvec_iter iter = init_bvec_iter_for_bio(bio); \
+ (iter).bi_size && \
+ (paddr = bio_iter_phys((bio), &(iter)), 1); \
+ bio_advance_iter_single((bio), &(iter), (blocksize)))
+
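btrfs_bio_for_each_block() steps through a bio one filesystem block at a time and hands back a physical address per step, instead of iterating per bio_vec. The same walk-by-block-over-segments idea, reduced to a self-contained userspace analogue (a segment array stands in for the bio and every name below is illustrative; as with the macro's contract, each segment covers whole blocks):

#include <stdio.h>

#define BLOCKSIZE 4u

struct seg { const char *base; unsigned int len; };
struct seg_iter { unsigned int idx; unsigned int done; unsigned int size; };

/* Address of the current block, like bio_iter_phys(). */
static const char *iter_addr(const struct seg *segs, const struct seg_iter *it)
{
	return segs[it->idx].base + it->done;
}

/* Advance by one block, like bio_advance_iter_single(). */
static void iter_advance(const struct seg *segs, struct seg_iter *it,
			 unsigned int bytes)
{
	it->size -= bytes;
	it->done += bytes;
	if (it->size && it->done == segs[it->idx].len) {
		it->done = 0;
		it->idx++;
	}
}

int main(void)
{
	struct seg segs[] = { { "AAAABBBB", 8 }, { "CCCC", 4 } };
	struct seg_iter it = { 0, 0, 12 };

	for (; it.size; iter_advance(segs, &it, BLOCKSIZE))
		printf("block at seg %u offset %u: %.4s\n",
		       it.idx, it.done, iter_addr(segs, &it));
	return 0;
}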
static inline void cond_wake_up(struct wait_queue_head *wq)
{
/*
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 74e38da9bd39..62b993fae54f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -6,12 +6,19 @@
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
+#include "file-item.h"
#include "print-tree.h"
#include "accessors.h"
#include "tree-checker.h"
#include "volumes.h"
#include "raid-stripe-tree.h"
+/*
+ * Large enough buffer size for the stringification of any key type yet short
+ * enough to use the stack and avoid allocations.
+ */
+#define KEY_TYPE_BUF_SIZE 32
+
struct root_name_map {
u64 id;
const char *name;
@@ -227,21 +234,209 @@ static void print_eb_refs_lock(const struct extent_buffer *eb)
#endif
}
+static void print_timespec(const struct extent_buffer *eb,
+ struct btrfs_timespec *timespec,
+ const char *prefix, const char *suffix)
+{
+ const u64 secs = btrfs_timespec_sec(eb, timespec);
+ const u32 nsecs = btrfs_timespec_nsec(eb, timespec);
+
+ pr_info("%s%llu.%u%s", prefix, secs, nsecs, suffix);
+}
+
+static void print_inode_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_inode_item *ii = btrfs_item_ptr(eb, i, struct btrfs_inode_item);
+
+ pr_info("\t\tinode generation %llu transid %llu size %llu nbytes %llu\n",
+ btrfs_inode_generation(eb, ii), btrfs_inode_transid(eb, ii),
+ btrfs_inode_size(eb, ii), btrfs_inode_nbytes(eb, ii));
+ pr_info("\t\tblock group %llu mode %o links %u uid %u gid %u\n",
+ btrfs_inode_block_group(eb, ii), btrfs_inode_mode(eb, ii),
+ btrfs_inode_nlink(eb, ii), btrfs_inode_uid(eb, ii),
+ btrfs_inode_gid(eb, ii));
+ pr_info("\t\trdev %llu sequence %llu flags 0x%llx\n",
+ btrfs_inode_rdev(eb, ii), btrfs_inode_sequence(eb, ii),
+ btrfs_inode_flags(eb, ii));
+ print_timespec(eb, &ii->atime, "\t\tatime ", "\n");
+ print_timespec(eb, &ii->ctime, "\t\tctime ", "\n");
+ print_timespec(eb, &ii->mtime, "\t\tmtime ", "\n");
+ print_timespec(eb, &ii->otime, "\t\totime ", "\n");
+}
+
+static void print_dir_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_dir_item *di = btrfs_item_ptr(eb, i, struct btrfs_dir_item);
+ u32 cur = 0;
+
+ while (cur < size) {
+ const u32 name_len = btrfs_dir_name_len(eb, di);
+ const u32 data_len = btrfs_dir_data_len(eb, di);
+ const u32 len = sizeof(*di) + name_len + data_len;
+ struct btrfs_key location;
+
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ pr_info("\t\tlocation key (%llu %u %llu) type %d\n",
+ location.objectid, location.type, location.offset,
+ btrfs_dir_ftype(eb, di));
+ pr_info("\t\ttransid %llu data_len %u name_len %u\n",
+ btrfs_dir_transid(eb, di), data_len, name_len);
+ di = (struct btrfs_dir_item *)((char *)di + len);
+ cur += len;
+ }
+}
+
+static void print_inode_ref_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_inode_ref *ref = btrfs_item_ptr(eb, i, struct btrfs_inode_ref);
+ u32 cur = 0;
+
+ while (cur < size) {
+ const u64 index = btrfs_inode_ref_index(eb, ref);
+ const u32 name_len = btrfs_inode_ref_name_len(eb, ref);
+ const u32 len = sizeof(*ref) + name_len;
+
+ pr_info("\t\tindex %llu name_len %u\n", index, name_len);
+ ref = (struct btrfs_inode_ref *)((char *)ref + len);
+ cur += len;
+ }
+}
+
+static void print_inode_extref_item(const struct extent_buffer *eb, int i)
+{
+ const u32 size = btrfs_item_size(eb, i);
+ struct btrfs_inode_extref *extref;
+ u32 cur = 0;
+
+ extref = btrfs_item_ptr(eb, i, struct btrfs_inode_extref);
+ while (cur < size) {
+ const u64 index = btrfs_inode_extref_index(eb, extref);
+ const u32 name_len = btrfs_inode_extref_name_len(eb, extref);
+ const u64 parent = btrfs_inode_extref_parent(eb, extref);
+ const u32 len = sizeof(*extref) + name_len;
+
+ pr_info("\t\tindex %llu parent %llu name_len %u\n",
+ index, parent, name_len);
+ extref = (struct btrfs_inode_extref *)((char *)extref + len);
+ cur += len;
+ }
+}
+
+static void print_dir_log_index_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_dir_log_item *dlog;
+
+ dlog = btrfs_item_ptr(eb, i, struct btrfs_dir_log_item);
+ pr_info("\t\tdir log end %llu\n", btrfs_dir_log_end(eb, dlog));
+}
+
+static void print_extent_csum(const struct extent_buffer *eb, int i)
+{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ const u32 size = btrfs_item_size(eb, i);
+ const u32 csum_bytes = (size / fs_info->csum_size) * fs_info->sectorsize;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(eb, &key, i);
+ pr_info("\t\trange start %llu end %llu length %u\n",
+ key.offset, key.offset + csum_bytes, csum_bytes);
+}
+
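print_extent_csum() derives the covered range purely from the item size: every csum_size bytes of item data checksums one sectorsize block. Worked numbers as a minimal sketch (crc32c's 4-byte checksum and a 4K sector size are assumptions for the example):

#include <stdio.h>

int main(void)
{
	unsigned int item_size = 64;	/* bytes of checksum data in the item */
	unsigned int csum_size = 4;	/* e.g. crc32c */
	unsigned int sectorsize = 4096;
	unsigned int csum_bytes = (item_size / csum_size) * sectorsize;

	/* 64 / 4 = 16 checksums -> 16 blocks -> 65536 bytes covered */
	printf("a %u byte csum item covers %u bytes of data\n",
	       item_size, csum_bytes);
	return 0;
}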
+static void print_file_extent_item(const struct extent_buffer *eb, int i)
+{
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ pr_info("\t\tgeneration %llu type %hhu\n",
+ btrfs_file_extent_generation(eb, fi),
+ btrfs_file_extent_type(eb, fi));
+
+ if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ pr_info("\t\tinline extent data size %u ram_bytes %llu compression %hhu\n",
+ btrfs_file_extent_inline_item_len(eb, i),
+ btrfs_file_extent_ram_bytes(eb, fi),
+ btrfs_file_extent_compression(eb, fi));
+ return;
+ }
+
+ pr_info("\t\textent data disk bytenr %llu nr %llu\n",
+ btrfs_file_extent_disk_bytenr(eb, fi),
+ btrfs_file_extent_disk_num_bytes(eb, fi));
+ pr_info("\t\textent data offset %llu nr %llu ram %llu\n",
+ btrfs_file_extent_offset(eb, fi),
+ btrfs_file_extent_num_bytes(eb, fi),
+ btrfs_file_extent_ram_bytes(eb, fi));
+ pr_info("\t\textent compression %hhu\n",
+ btrfs_file_extent_compression(eb, fi));
+}
+
+static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size)
+{
+ static const char *key_to_str[256] = {
+ [BTRFS_INODE_ITEM_KEY] = "INODE_ITEM",
+ [BTRFS_INODE_REF_KEY] = "INODE_REF",
+ [BTRFS_INODE_EXTREF_KEY] = "INODE_EXTREF",
+ [BTRFS_DIR_ITEM_KEY] = "DIR_ITEM",
+ [BTRFS_DIR_INDEX_KEY] = "DIR_INDEX",
+ [BTRFS_DIR_LOG_ITEM_KEY] = "DIR_LOG_ITEM",
+ [BTRFS_DIR_LOG_INDEX_KEY] = "DIR_LOG_INDEX",
+ [BTRFS_XATTR_ITEM_KEY] = "XATTR_ITEM",
+ [BTRFS_VERITY_DESC_ITEM_KEY] = "VERITY_DESC_ITEM",
+ [BTRFS_VERITY_MERKLE_ITEM_KEY] = "VERITY_MERKLE_ITEM",
+ [BTRFS_ORPHAN_ITEM_KEY] = "ORPHAN_ITEM",
+ [BTRFS_ROOT_ITEM_KEY] = "ROOT_ITEM",
+ [BTRFS_ROOT_REF_KEY] = "ROOT_REF",
+ [BTRFS_ROOT_BACKREF_KEY] = "ROOT_BACKREF",
+ [BTRFS_EXTENT_ITEM_KEY] = "EXTENT_ITEM",
+ [BTRFS_METADATA_ITEM_KEY] = "METADATA_ITEM",
+ [BTRFS_TREE_BLOCK_REF_KEY] = "TREE_BLOCK_REF",
+ [BTRFS_SHARED_BLOCK_REF_KEY] = "SHARED_BLOCK_REF",
+ [BTRFS_EXTENT_DATA_REF_KEY] = "EXTENT_DATA_REF",
+ [BTRFS_SHARED_DATA_REF_KEY] = "SHARED_DATA_REF",
+ [BTRFS_EXTENT_OWNER_REF_KEY] = "EXTENT_OWNER_REF",
+ [BTRFS_EXTENT_CSUM_KEY] = "EXTENT_CSUM",
+ [BTRFS_EXTENT_DATA_KEY] = "EXTENT_DATA",
+ [BTRFS_BLOCK_GROUP_ITEM_KEY] = "BLOCK_GROUP_ITEM",
+ [BTRFS_FREE_SPACE_INFO_KEY] = "FREE_SPACE_INFO",
+ [BTRFS_FREE_SPACE_EXTENT_KEY] = "FREE_SPACE_EXTENT",
+ [BTRFS_FREE_SPACE_BITMAP_KEY] = "FREE_SPACE_BITMAP",
+ [BTRFS_CHUNK_ITEM_KEY] = "CHUNK_ITEM",
+ [BTRFS_DEV_ITEM_KEY] = "DEV_ITEM",
+ [BTRFS_DEV_EXTENT_KEY] = "DEV_EXTENT",
+ [BTRFS_TEMPORARY_ITEM_KEY] = "TEMPORARY_ITEM",
+ [BTRFS_DEV_REPLACE_KEY] = "DEV_REPLACE",
+ [BTRFS_STRING_ITEM_KEY] = "STRING_ITEM",
+ [BTRFS_QGROUP_STATUS_KEY] = "QGROUP_STATUS",
+ [BTRFS_QGROUP_RELATION_KEY] = "QGROUP_RELATION",
+ [BTRFS_QGROUP_INFO_KEY] = "QGROUP_INFO",
+ [BTRFS_QGROUP_LIMIT_KEY] = "QGROUP_LIMIT",
+ [BTRFS_PERSISTENT_ITEM_KEY] = "PERSISTENT_ITEM",
+ [BTRFS_UUID_KEY_SUBVOL] = "UUID_KEY_SUBVOL",
+ [BTRFS_UUID_KEY_RECEIVED_SUBVOL] = "UUID_KEY_RECEIVED_SUBVOL",
+ [BTRFS_RAID_STRIPE_KEY] = "RAID_STRIPE",
+ };
+
+ if (key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+ scnprintf(buf, buf_size, "UNTYPED");
+ else if (key_to_str[key->type])
+ scnprintf(buf, buf_size, "%s", key_to_str[key->type]);
+ else
+ scnprintf(buf, buf_size, "UNKNOWN.%d", key->type);
+}
+
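With key_type_string() in place, leaf dumps print symbolic key types (e.g. "key (256 INODE_ITEM 0)") and fall back to "UNKNOWN.<type>" for anything missing from the table. A stripped-down userspace sketch of the lookup-with-fallback; the index values below are placeholders, not the real btrfs key type numbers:

#include <stdio.h>

/* Placeholder table entries for illustration only. */
static const char *key_to_str[256] = {
	[1] = "INODE_ITEM",
	[2] = "INODE_REF",
};

static void key_type_string(unsigned char type, char *buf, int buf_size)
{
	if (key_to_str[type])
		snprintf(buf, buf_size, "%s", key_to_str[type]);
	else
		snprintf(buf, buf_size, "UNKNOWN.%d", type);
}

int main(void)
{
	char buf[32];

	key_type_string(1, buf, sizeof(buf));
	printf("known:   %s\n", buf);	/* INODE_ITEM */
	key_type_string(7, buf, sizeof(buf));
	printf("unknown: %s\n", buf);	/* UNKNOWN.7 */
	return 0;
}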
void btrfs_print_leaf(const struct extent_buffer *l)
{
struct btrfs_fs_info *fs_info;
int i;
u32 type, nr;
struct btrfs_root_item *ri;
- struct btrfs_dir_item *di;
- struct btrfs_inode_item *ii;
struct btrfs_block_group_item *bi;
- struct btrfs_file_extent_item *fi;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
struct btrfs_dev_extent *dev_extent;
struct btrfs_key key;
- struct btrfs_key found_key;
if (!l)
return;
@@ -255,25 +450,35 @@ void btrfs_print_leaf(const struct extent_buffer *l)
btrfs_leaf_free_space(l), btrfs_header_owner(l));
print_eb_refs_lock(l);
for (i = 0 ; i < nr ; i++) {
+ char key_buf[KEY_TYPE_BUF_SIZE];
+
btrfs_item_key_to_cpu(l, &key, i);
type = key.type;
- pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
- i, key.objectid, type, key.offset,
+ key_type_string(&key, key_buf, KEY_TYPE_BUF_SIZE);
+
+ pr_info("\titem %d key (%llu %s %llu) itemoff %d itemsize %d\n",
+ i, key.objectid, key_buf, key.offset,
btrfs_item_offset(l, i), btrfs_item_size(l, i));
switch (type) {
case BTRFS_INODE_ITEM_KEY:
- ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
- pr_info("\t\tinode generation %llu size %llu mode %o\n",
- btrfs_inode_generation(l, ii),
- btrfs_inode_size(l, ii),
- btrfs_inode_mode(l, ii));
+ print_inode_item(l, i);
+ break;
+ case BTRFS_INODE_REF_KEY:
+ print_inode_ref_item(l, i);
+ break;
+ case BTRFS_INODE_EXTREF_KEY:
+ print_inode_extref_item(l, i);
break;
case BTRFS_DIR_ITEM_KEY:
- di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(l, di, &found_key);
- pr_info("\t\tdir oid %llu flags %u\n",
- found_key.objectid,
- btrfs_dir_flags(l, di));
+ case BTRFS_DIR_INDEX_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ print_dir_item(l, i);
+ break;
+ case BTRFS_DIR_LOG_INDEX_KEY:
+ print_dir_log_index_item(l, i);
+ break;
+ case BTRFS_EXTENT_CSUM_KEY:
+ print_extent_csum(l, i);
break;
case BTRFS_ROOT_ITEM_KEY:
ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
@@ -303,24 +508,7 @@ void btrfs_print_leaf(const struct extent_buffer *l)
btrfs_shared_data_ref_count(l, sref));
break;
case BTRFS_EXTENT_DATA_KEY:
- fi = btrfs_item_ptr(l, i,
- struct btrfs_file_extent_item);
- pr_info("\t\tgeneration %llu type %hhu\n",
- btrfs_file_extent_generation(l, fi),
- btrfs_file_extent_type(l, fi));
- if (btrfs_file_extent_type(l, fi) ==
- BTRFS_FILE_EXTENT_INLINE) {
- pr_info("\t\tinline extent data size %llu\n",
- btrfs_file_extent_ram_bytes(l, fi));
- break;
- }
- pr_info("\t\textent data disk bytenr %llu nr %llu\n",
- btrfs_file_extent_disk_bytenr(l, fi),
- btrfs_file_extent_disk_num_bytes(l, fi));
- pr_info("\t\textent data offset %llu nr %llu ram %llu\n",
- btrfs_file_extent_offset(l, fi),
- btrfs_file_extent_num_bytes(l, fi),
- btrfs_file_extent_ram_bytes(l, fi));
+ print_file_extent_item(l, i);
break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
bi = btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ccaa9a3cf1ce..1175b8192cd7 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1069,7 +1069,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
}
path = btrfs_alloc_path();
- if (!path) {
+ if (unlikely(!path)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out_free_root;
@@ -1081,7 +1081,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*ptr));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1111,7 +1111,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
if (ret > 0)
goto out_add_root;
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1129,7 +1129,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
/* We should not have a stray @prealloc pointer. */
ASSERT(prealloc == NULL);
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
- if (!prealloc) {
+ if (unlikely(!prealloc)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out_free_path;
@@ -1137,7 +1137,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1145,13 +1145,13 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
ret = btrfs_search_slot_for_read(tree_root, &found_key,
path, 1, 0);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1165,7 +1165,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
}
}
ret = btrfs_next_item(tree_root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1176,7 +1176,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
out_add_root:
btrfs_release_path(path);
ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1190,7 +1190,7 @@ out_add_root:
qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -1376,13 +1376,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_free_qgroup_config(fs_info);
ret = btrfs_clean_quota_tree(trans, quota_root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_del_root(trans, &quota_root->root_key);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1455,6 +1455,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *qgroup;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
+ u64 num_bytes_cmpr = src->excl_cmpr;
int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -1466,11 +1467,12 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup_list *glist;
qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes_cmpr;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr);
qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes_cmpr;
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
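The __qgroup_excl_accounting() change stops adding the uncompressed byte count to the *_cmpr counters; those track on-disk (compressed) usage, so reusing src->excl there inflates them whenever the extents are compressed. Toy numbers as a sketch, assuming a 128K extent that compresses to 16K:

#include <stdio.h>

int main(void)
{
	long excl = 131072;		/* uncompressed exclusive bytes */
	long excl_cmpr = 16384;		/* compressed exclusive bytes */
	long rfer_cmpr_old = 0, rfer_cmpr_new = 0;

	rfer_cmpr_old += excl;		/* before the fix: uncompressed size leaks in */
	rfer_cmpr_new += excl_cmpr;	/* after the fix: compressed size, as intended */

	printf("rfer_cmpr old=%ld new=%ld\n", rfer_cmpr_old, rfer_cmpr_new);
	return 0;
}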
@@ -2424,9 +2426,9 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
int i;
/* Level sanity check */
- if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
- root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
- root_level < cur_level) {
+ if (unlikely(cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
+ root_level < cur_level)) {
btrfs_err_rl(fs_info,
"%s: bad levels, cur_level=%d root_level=%d",
__func__, cur_level, root_level);
@@ -2442,7 +2444,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans,
* dst_path->nodes[root_level] must be initialized before
* calling this function.
*/
- if (cur_level == root_level) {
+ if (unlikely(cur_level == root_level)) {
btrfs_err_rl(fs_info,
"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
__func__, root_level, root_level, cur_level);
@@ -2528,7 +2530,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
return 0;
/* Wrong parameter order */
- if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
+ if (unlikely(btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb))) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
btrfs_header_generation(src_eb),
@@ -2536,7 +2538,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
return -EUCLEAN;
}
- if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+ if (unlikely(!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb))) {
ret = -EIO;
goto out;
}
@@ -2727,7 +2729,7 @@ static void qgroup_iterator_nested_clean(struct list_head *head)
*/
static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
struct ulist *roots, struct list_head *qgroups,
- u64 seq, int update_old)
+ u64 seq, bool update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -4708,8 +4710,8 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
- btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ if (unlikely(btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot))) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
__func__,
@@ -4841,7 +4843,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
reloc_eb = NULL;
goto free_out;
}
- if (!extent_buffer_uptodate(reloc_eb)) {
+ if (unlikely(!extent_buffer_uptodate(reloc_eb))) {
ret = -EIO;
goto free_out;
}
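
The behavioural fix in __qgroup_excl_accounting() above is easiest to see in isolation: the compressed counters have to move by the source group's compressed exclusive bytes, not by its uncompressed count. A minimal sketch of just that arithmetic, using simplified, hypothetical struct and function names rather than the real struct btrfs_qgroup:

/* Illustrative stand-in for the relevant btrfs_qgroup counters. */
struct qgroup_counts {
	u64 rfer, rfer_cmpr;	/* referenced bytes: raw / compressed */
	u64 excl, excl_cmpr;	/* exclusive bytes: raw / compressed */
};

static void excl_accounting(struct qgroup_counts *qg,
			    const struct qgroup_counts *src, int sign)
{
	qg->rfer      += sign * src->excl;
	qg->rfer_cmpr += sign * src->excl_cmpr;	/* previously scaled by src->excl */
	qg->excl      += sign * src->excl;
	qg->excl_cmpr += sign * src->excl_cmpr;	/* previously scaled by src->excl */
}
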
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index cab0b291088c..cc6f6095cc9f 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -67,7 +67,7 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *stripe_root = fs_info->stripe_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *leaf;
u64 found_start;
@@ -260,7 +260,6 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
btrfs_release_path(path);
}
- btrfs_free_path(path);
return ret;
}
@@ -269,7 +268,7 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
struct btrfs_stripe_extent *stripe_extent,
const size_t item_size)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
int ret;
int slot;
@@ -288,7 +287,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
item_size);
- btrfs_free_path(path);
return ret;
}
@@ -306,7 +304,7 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
int ret;
stripe_extent = kzalloc(item_size, GFP_NOFS);
- if (!stripe_extent) {
+ if (unlikely(!stripe_extent)) {
btrfs_abort_transaction(trans, -ENOMEM);
btrfs_end_transaction(trans);
return -ENOMEM;
@@ -376,7 +374,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
struct btrfs_stripe_extent *stripe_extent;
struct btrfs_key stripe_key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
const u64 end = logical + *length;
int num_stripes;
@@ -402,7 +400,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
if (ret < 0)
- goto free_path;
+ return ret;
if (ret) {
if (path->slots[0] != 0)
path->slots[0]--;
@@ -459,8 +457,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
stripe->physical, devid);
- ret = 0;
- goto free_path;
+ return 0;
}
/* If we're here, we haven't found the requested devid in the stripe. */
@@ -474,8 +471,6 @@ out:
logical, logical + *length, stripe->dev->devid,
btrfs_bg_type_to_raid_name(map_type));
}
-free_path:
- btrfs_free_path(path);
return ret;
}
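
Several hunks in this series (raid-stripe-tree.c above, and relocation.c, root-tree.c, scrub.c and send.c further down) replace an explicit btrfs_free_path() call with the BTRFS_PATH_AUTO_FREE() declaration, which is why the error paths can simply "return ret" instead of jumping to a free_path/out label. A minimal sketch of how such a macro can be built on the kernel's scope-based cleanup helpers from <linux/cleanup.h>; the exact in-tree definition may differ in detail:

#include <linux/cleanup.h>

/* Assumed shape of the macro; the real one lives in the btrfs headers. */
DEFINE_FREE(btrfs_free_path, struct btrfs_path *, if (_T) btrfs_free_path(_T))
#define BTRFS_PATH_AUTO_FREE(path_name) \
	struct btrfs_path *path_name __free(btrfs_free_path) = NULL

/* Usage: the path is freed automatically on every return from the scope. */
static int lookup_example(struct btrfs_root *root, const struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	return btrfs_search_slot(NULL, root, key, path, 0, 0);
}
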
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 3ff2bedfb3a4..0135dceb7baa 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1167,7 +1167,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
/* Check if we have reached tolerance early. */
found_errors = get_rbio_veritical_errors(rbio, sector_nr,
NULL, NULL);
- if (found_errors > rbio->bioc->max_errors)
+ if (unlikely(found_errors > rbio->bioc->max_errors))
return -EIO;
return 0;
}
@@ -1208,17 +1208,16 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits;
struct bvec_iter iter = bio->bi_iter;
+ phys_addr_t paddr;
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
rbio->bioc->full_stripe_logical;
- while (iter.bi_size) {
+ btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) {
unsigned int index = (offset >> sectorsize_bits);
struct sector_ptr *sector = &rbio->bio_sectors[index];
- struct bio_vec bv = bio_iter_iovec(bio, iter);
sector->has_paddr = true;
- sector->paddr = bvec_phys(&bv);
- bio_advance_iter_single(bio, &iter, sectorsize);
+ sector->paddr = paddr;
offset += sectorsize;
}
}
@@ -1511,22 +1510,17 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
*/
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ const u32 blocksize = rbio->bioc->fs_info->sectorsize;
+ phys_addr_t paddr;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct sector_ptr *sector;
- phys_addr_t paddr = bvec_phys(bvec);
+ btrfs_bio_for_each_block_all(paddr, bio, blocksize) {
+ struct sector_ptr *sector = find_stripe_sector(rbio, paddr);
- for (u32 off = 0; off < bvec->bv_len; off += sectorsize) {
- sector = find_stripe_sector(rbio, paddr + off);
- ASSERT(sector);
- if (sector)
- sector->uptodate = 1;
- }
+ ASSERT(sector);
+ if (sector)
+ sector->uptodate = 1;
}
}
@@ -1573,8 +1567,7 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
int total_sector_nr = get_bio_sector_nr(rbio, bio);
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ phys_addr_t paddr;
/* No data csum for the whole stripe, no need to verify. */
if (!rbio->csum_bitmap || !rbio->csum_buf)
@@ -1584,27 +1577,20 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
return;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- void *kaddr;
-
- kaddr = bvec_kmap_local(bvec);
- for (u32 off = 0; off < bvec->bv_len;
- off += fs_info->sectorsize, total_sector_nr++) {
- u8 csum_buf[BTRFS_CSUM_SIZE];
- u8 *expected_csum = rbio->csum_buf +
- total_sector_nr * fs_info->csum_size;
- int ret;
+ btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) {
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
+ int ret;
- /* No csum for this sector, skip to the next sector. */
- if (!test_bit(total_sector_nr, rbio->csum_bitmap))
- continue;
+ /* No csum for this sector, skip to the next sector. */
+ if (!test_bit(total_sector_nr, rbio->csum_bitmap)) {
+ total_sector_nr++;
+ continue;
+ }
- ret = btrfs_check_sector_csum(fs_info, kaddr + off,
- csum_buf, expected_csum);
- if (ret < 0)
- set_bit(total_sector_nr, rbio->error_bitmap);
- }
- kunmap_local(kaddr);
+ ret = btrfs_check_block_csum(fs_info, paddr,
+ csum_buf, expected_csum);
+ if (ret < 0)
+ set_bit(total_sector_nr, rbio->error_bitmap);
+ total_sector_nr++;
}
}
@@ -1802,7 +1788,6 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
struct sector_ptr *sector;
u8 csum_buf[BTRFS_CSUM_SIZE];
u8 *csum_expected;
- void *kaddr;
int ret;
if (!rbio->csum_bitmap || !rbio->csum_buf)
@@ -1824,9 +1809,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
csum_expected = rbio->csum_buf +
(stripe_nr * rbio->stripe_nsectors + sector_nr) *
fs_info->csum_size;
- kaddr = kmap_local_sector(sector);
- ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected);
- kunmap_local(kaddr);
+ ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected);
return ret;
}
@@ -1864,7 +1847,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
if (!found_errors)
return 0;
- if (found_errors > rbio->bioc->max_errors)
+ if (unlikely(found_errors > rbio->bioc->max_errors))
return -EIO;
/*
@@ -2416,7 +2399,7 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
int found_errors;
found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
- if (found_errors > rbio->bioc->max_errors) {
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
break;
}
@@ -2705,7 +2688,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
found_errors = get_rbio_veritical_errors(rbio, sector_nr,
&faila, &failb);
- if (found_errors > rbio->bioc->max_errors) {
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
goto out;
}
@@ -2729,7 +2712,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
* data, so the capability of the repair is declined. (In the
* case of RAID5, we can not repair anything.)
*/
- if (dfail > rbio->bioc->max_errors - 1) {
+ if (unlikely(dfail > rbio->bioc->max_errors - 1)) {
ret = -EIO;
goto out;
}
@@ -2746,7 +2729,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
* scrubbing parity, luckily, use the other one to repair the
* data, or we can not repair the data stripe.
*/
- if (failp != rbio->scrubp) {
+ if (unlikely(failp != rbio->scrubp)) {
ret = -EIO;
goto out;
}
@@ -2837,7 +2820,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio)
int found_errors;
found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
- if (found_errors > rbio->bioc->max_errors) {
+ if (unlikely(found_errors > rbio->bioc->max_errors)) {
ret = -EIO;
break;
}
@@ -2861,19 +2844,22 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
* This is for scrub call sites where we already have correct data contents.
* This allows us to avoid reading data stripes again.
*
- * Unfortunately here we have to do page copy, other than reusing the pages.
+ * Unfortunately here we have to do folio copy, rather than reusing the folios.
* This is due to the fact rbio has its own page management for its cache.
*/
-void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
- struct page **data_pages, u64 data_logical)
+void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
+ struct folio **data_folios, u64 data_logical)
{
+ struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
const u64 offset_in_full_stripe = data_logical -
rbio->bioc->full_stripe_logical;
- const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ unsigned int findex = 0;
+ unsigned int foffset = 0;
int ret;
+ /* We shouldn't hit RAID56 for bs > ps cases for now. */
+ ASSERT(fs_info->sectorsize <= PAGE_SIZE);
+
/*
* If we hit ENOMEM temporarily, but later at
* raid56_parity_submit_scrub_rbio() time it succeeded, we just do
@@ -2890,14 +2876,25 @@ void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
- for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
- struct page *dst = rbio->stripe_pages[page_nr + page_index];
- struct page *src = data_pages[page_nr];
+ for (unsigned int cur_off = offset_in_full_stripe;
+ cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN;
+ cur_off += PAGE_SIZE) {
+ const unsigned int pindex = cur_off >> PAGE_SHIFT;
+ void *kaddr;
+
+ kaddr = kmap_local_page(rbio->stripe_pages[pindex]);
+ memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE);
+ kunmap_local(kaddr);
- memcpy_page(dst, 0, src, 0, PAGE_SIZE);
- for (int sector_nr = sectors_per_page * page_index;
- sector_nr < sectors_per_page * (page_index + 1);
- sector_nr++)
- rbio->stripe_sectors[sector_nr].uptodate = true;
+ foffset += PAGE_SIZE;
+ ASSERT(foffset <= folio_size(data_folios[findex]));
+ if (foffset == folio_size(data_folios[findex])) {
+ findex++;
+ foffset = 0;
+ }
}
+ for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits;
+ sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits;
+ sector_nr++)
+ rbio->stripe_sectors[sector_nr].uptodate = true;
}
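
raid56.c above converts several open-coded bio walks to btrfs_bio_for_each_block()/btrfs_bio_for_each_block_all(), which hand back one physical address per filesystem block instead of a bio_vec plus an inner per-sector offset loop. For readers who have not seen the helpers, an approximate open-coded equivalent of the set_bio_pages_uptodate() loop (not the in-tree macro, only what it abstracts) looks like this:

/*
 * Approximation of btrfs_bio_for_each_block_all(paddr, bio, blocksize) for a
 * non-cloned bio: visit each block-sized chunk of every segment and yield its
 * physical address.
 */
static void mark_blocks_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio,
				 u32 blocksize)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all) {
		for (u32 off = 0; off < bvec->bv_len; off += blocksize) {
			phys_addr_t paddr = bvec_phys(bvec) + off;
			struct sector_ptr *sector = find_stripe_sector(rbio, paddr);

			if (sector)
				sector->uptodate = 1;
		}
	}
}
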
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 0d7b4c2fb6ae..84c4d1d29c7a 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -201,8 +201,8 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
-void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
- struct page **data_pages, u64 data_logical);
+void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
+ struct folio **data_folios, u64 data_logical);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 3871c3a6c743..de4cb0f3fbd0 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -971,7 +971,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *extent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
int tree_block_level = 0;
u64 bytenr = 0, num_bytes = 0;
@@ -980,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
if (!btrfs_test_opt(fs_info, REF_VERIFY))
return 0;
+ extent_root = btrfs_extent_root(fs_info, 0);
+ /* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */
+ if (IS_ERR(extent_root)) {
+ btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling");
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- extent_root = btrfs_extent_root(fs_info, 0);
eb = btrfs_read_lock_root_node(extent_root);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
@@ -1014,6 +1021,5 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index 559bd25a2b7a..1ce544d53cc5 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -12,7 +12,7 @@
struct btrfs_fs_info;
struct btrfs_ref;
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+#ifdef CONFIG_BTRFS_DEBUG
#include <linux/spinlock.h>
@@ -53,6 +53,6 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info)
{
}
-#endif /* CONFIG_BTRFS_FS_REF_VERIFY */
+#endif /* CONFIG_BTRFS_DEBUG */
#endif
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index ce25ab7f0e99..5465a5eae9b2 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -23,7 +23,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
u64 endoff,
const u64 destoff,
const u64 olen,
- int no_time_update)
+ bool no_time_update)
{
int ret;
@@ -43,7 +43,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
}
ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -268,12 +268,12 @@ copy_inline_extent:
drop_args.end = aligned_end;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -285,7 +285,7 @@ copy_inline_extent:
btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
btrfs_set_inode_full_sync(inode);
ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
- if (ret)
+ if (unlikely(ret))
btrfs_abort_transaction(trans, ret);
out:
if (!ret && !trans) {
@@ -337,10 +337,10 @@ copy_to_page:
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff, int no_time_update)
+ const u64 destoff, bool no_time_update)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_trans_handle *trans;
char *buf = NULL;
@@ -611,7 +611,6 @@ process_slot:
}
out:
- btrfs_free_path(path);
kvfree(buf);
clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
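
Most of the mechanical changes in qgroup.c and reflink.c above, and in relocation.c, root-tree.c and scrub.c below, wrap conditions that end in btrfs_abort_transaction() with unlikely(). In the kernel, unlikely(x) expands to roughly __builtin_expect(!!(x), 0), so the compiler lays the abort branch out as cold code and keeps the success path straight-line. A minimal sketch of the recurring pattern, with an illustrative function name:

#ifndef unlikely
#define unlikely(x)	__builtin_expect(!!(x), 0)	/* as in <linux/compiler.h> */
#endif

static int insert_step(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		       struct btrfs_path *path, const struct btrfs_key *key,
		       u32 size)
{
	int ret = btrfs_insert_empty_item(trans, root, path, key, size);

	if (unlikely(ret)) {		/* cold path: abort the transaction */
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	return 0;
}
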
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 7256f6748c8f..8dd8de6b9fb8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -821,7 +821,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
u64 bytenr, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
int ret;
@@ -834,11 +834,9 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ return ret;
+ if (ret > 0)
+ return -ENOENT;
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -849,16 +847,11 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi));
- if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
- ret = -EINVAL;
- goto out;
- }
+ if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi))
+ return -EINVAL;
*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -974,7 +967,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_data_ref(&ref, key.objectid, key.offset,
btrfs_root_id(root), false);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -988,7 +981,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_data_ref(&ref, key.objectid, key.offset,
btrfs_root_id(root), false);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1199,7 +1192,7 @@ again:
ref.ref_root = btrfs_root_id(src);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1212,7 +1205,7 @@ again:
ref.ref_root = btrfs_root_id(dest);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1226,7 +1219,7 @@ again:
ref.ref_root = btrfs_root_id(src);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1240,7 +1233,7 @@ again:
ref.ref_root = btrfs_root_id(dest);
btrfs_init_tree_ref(&ref, level - 1, 0, true);
ret = btrfs_free_extent(trans, &ref);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
break;
}
@@ -1490,7 +1483,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
* ->reloc_root. If it fails however we must
* drop the ref ourselves.
*/
- ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
+ ret2 = btrfs_drop_snapshot(reloc_root, false, true);
if (ret2 < 0) {
btrfs_put_root(reloc_root);
if (!ret)
@@ -1500,7 +1493,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
btrfs_put_root(root);
} else {
/* Orphan reloc tree, just clean it up */
- ret2 = btrfs_drop_snapshot(root, 0, 1);
+ ret2 = btrfs_drop_snapshot(root, false, true);
if (ret2 < 0) {
btrfs_put_root(root);
if (!ret)
@@ -1791,7 +1784,7 @@ again:
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
if (!err)
err = ret;
@@ -1960,7 +1953,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root));
return PTR_ERR(root);
}
- if (root->reloc_root != reloc_root) {
+ if (unlikely(root->reloc_root != reloc_root)) {
DEBUG_WARN("unexpected reloc root found");
btrfs_err(fs_info,
"root %llu has two reloc roots associated with it",
@@ -2031,7 +2024,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOENT);
- if (next->new_bytenr) {
+ if (unlikely(next->new_bytenr)) {
/*
* We just created the reloc root, so we shouldn't have
* ->new_bytenr set yet. If it is then we have multiple roots
@@ -2090,7 +2083,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
* This can occur if we have incomplete extent refs leading all
* the way up a particular path, in this case return -EUCLEAN.
*/
- if (!root)
+ if (unlikely(!root))
return ERR_PTR(-EUCLEAN);
/* No other choice for non-shareable tree */
@@ -2277,7 +2270,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (lowest) {
- if (bytenr != node->bytenr) {
+ if (unlikely(bytenr != node->bytenr)) {
btrfs_err(root->fs_info,
"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
bytenr, node->bytenr, slot,
@@ -2332,7 +2325,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
upper->eb);
- if (ret)
+ if (unlikely(ret))
btrfs_abort_transaction(trans, ret);
}
next:
@@ -2454,7 +2447,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
eb = read_tree_block(fs_info, block->bytenr, &check);
if (IS_ERR(eb))
return PTR_ERR(eb);
- if (!extent_buffer_uptodate(eb)) {
+ if (unlikely(!extent_buffer_uptodate(eb))) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2519,7 +2512,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
* normal user in the case of corruption.
*/
ASSERT(node->new_bytenr == 0);
- if (node->new_bytenr) {
+ if (unlikely(node->new_bytenr)) {
btrfs_err(root->fs_info,
"bytenr %llu has improper references to it",
node->bytenr);
@@ -2839,7 +2832,7 @@ again:
if (!folio_test_uptodate(folio)) {
btrfs_read_folio(NULL, folio);
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = -EIO;
goto release_folio;
}
@@ -3158,7 +3151,7 @@ static int __add_tree_block(struct reloc_control *rc,
struct rb_root *blocks)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret;
bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -3186,7 +3179,7 @@ again:
path->skip_locking = 1;
ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && skinny) {
if (path->slots[0]) {
@@ -3213,14 +3206,10 @@ again:
"tree block extent item (%llu) is not found in extent tree",
bytenr);
WARN_ON(1);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
- ret = add_tree_block(rc, &key, path, blocks);
-out:
- btrfs_free_path(path);
- return ret;
+ return add_tree_block(rc, &key, path, blocks);
}
static int delete_block_group_cache(struct btrfs_block_group *block_group,
@@ -3510,7 +3499,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_item *ei;
u64 flags;
int ret;
@@ -3679,14 +3668,13 @@ out_free:
if (ret < 0 && !err)
err = ret;
btrfs_free_block_rsv(fs_info, rc->block_rsv);
- btrfs_free_path(path);
return err;
}
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
int ret;
@@ -3697,7 +3685,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_inode(trans, root, path, objectid);
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
@@ -3707,15 +3695,13 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static void delete_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -3738,7 +3724,6 @@ static void delete_orphan_inode(struct btrfs_trans_handle *trans,
out:
if (ret)
btrfs_abort_transaction(trans, ret);
- btrfs_free_path(path);
}
/*
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index e22e6b06927a..d07eab70f759 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -85,7 +85,7 @@ int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of the valid range.
*/
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -130,7 +130,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
*item)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *l;
int ret;
int slot;
@@ -143,15 +143,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
btrfs_crit(fs_info,
"unable to find root key (%llu %u %llu) in tree %llu",
key->objectid, key->type, key->offset, btrfs_root_id(root));
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
l = path->nodes[0];
@@ -168,22 +168,22 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, key, path,
-1, 1);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path,
key, sizeof(*item));
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
l = path->nodes[0];
slot = path->slots[0];
@@ -197,8 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
write_extent_buffer(l, item, ptr, sizeof(*item));
-out:
- btrfs_free_path(path);
return ret;
}
@@ -216,7 +214,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root;
int err = 0;
@@ -309,7 +307,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
btrfs_put_root(root);
}
- btrfs_free_path(path);
return err;
}
@@ -318,7 +315,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
const struct btrfs_key *key)
{
struct btrfs_root *root = trans->fs_info->tree_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
path = btrfs_alloc_path();
@@ -326,17 +323,12 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
- goto out;
- if (ret != 0) {
+ return ret;
+ if (unlikely(ret > 0))
/* The root must exist but we did not find it by the key. */
- ret = -EUCLEAN;
- goto out;
- }
+ return -EUCLEAN;
- ret = btrfs_del_item(trans, root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, root, path);
}
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
@@ -344,7 +336,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -361,7 +353,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0) {
- goto out;
+ return ret;
} else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -369,18 +361,16 @@ again:
ptr = (unsigned long)(ref + 1);
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
(btrfs_root_ref_name_len(leaf, ref) != name->len) ||
- memcmp_extent_buffer(leaf, name->name, ptr, name->len)) {
- ret = -ENOENT;
- goto out;
- }
+ memcmp_extent_buffer(leaf, name->name, ptr, name->len))
+ return -ENOENT;
+
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
- goto out;
+ return ret;
} else {
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
@@ -391,8 +381,6 @@ again:
goto again;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -418,7 +406,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_key key;
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
unsigned long ptr;
@@ -433,9 +421,8 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name->len);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
- btrfs_free_path(path);
return ret;
}
@@ -455,7 +442,6 @@ again:
goto again;
}
- btrfs_free_path(path);
return 0;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6776e6ab8d10..4691d0bdb2e8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -113,7 +113,7 @@ enum {
/* Which blocks are covered by extent items. */
scrub_bitmap_nr_has_extent = 0,
- /* Which blocks are meteadata. */
+ /* Which blocks are metadata. */
scrub_bitmap_nr_is_metadata,
/*
@@ -130,7 +130,7 @@ enum {
scrub_bitmap_nr_last,
};
-#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE)
+#define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE)
/*
* Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
@@ -139,7 +139,7 @@ struct scrub_stripe {
struct scrub_ctx *sctx;
struct btrfs_block_group *bg;
- struct page *pages[SCRUB_STRIPE_PAGES];
+ struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS];
struct scrub_sector_verification *sectors;
struct btrfs_device *dev;
@@ -206,7 +206,7 @@ struct scrub_ctx {
ktime_t throttle_deadline;
u64 throttle_sent;
- int is_dev_replace;
+ bool is_dev_replace;
u64 write_pointer;
struct mutex wr_lock;
@@ -339,10 +339,10 @@ static void release_scrub_stripe(struct scrub_stripe *stripe)
if (!stripe)
return;
- for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
- if (stripe->pages[i])
- __free_page(stripe->pages[i]);
- stripe->pages[i] = NULL;
+ for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) {
+ if (stripe->folios[i])
+ folio_put(stripe->folios[i]);
+ stripe->folios[i] = NULL;
}
kfree(stripe->sectors);
kfree(stripe->csums);
@@ -355,6 +355,7 @@ static void release_scrub_stripe(struct scrub_stripe *stripe)
static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
struct scrub_stripe *stripe)
{
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
int ret;
memset(stripe, 0, sizeof(*stripe));
@@ -367,7 +368,9 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false);
+ ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS);
+ ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift,
+ fs_info->block_min_order, stripe->folios);
if (ret < 0)
goto error;
@@ -446,7 +449,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx)
}
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
- struct btrfs_fs_info *fs_info, int is_dev_replace)
+ struct btrfs_fs_info *fs_info, bool is_dev_replace)
{
struct scrub_ctx *sctx;
int i;
@@ -585,7 +588,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
bool is_super, u64 logical, u64 physical)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key found_key;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
@@ -612,7 +615,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
&flags);
if (ret < 0)
- goto out;
+ return;
swarn.extent_item_size = found_key.offset;
@@ -658,9 +661,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
}
-
-out:
- btrfs_free_path(path);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
@@ -687,13 +687,30 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr)
{
- u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits);
- const struct page *page = stripe->pages[offset >> PAGE_SHIFT];
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ u32 offset = (sector_nr << fs_info->sectorsize_bits);
+ const struct folio *folio = stripe->folios[offset >> min_folio_shift];
- /* stripe->pages[] is allocated by us and no highmem is allowed. */
- ASSERT(page);
- ASSERT(!PageHighMem(page));
- return page_address(page) + offset_in_page(offset);
+ /* stripe->folios[] is allocated by us and no highmem is allowed. */
+ ASSERT(folio);
+ ASSERT(!folio_test_partial_kmap(folio));
+ return folio_address(folio) + offset_in_folio(folio, offset);
+}
+
+static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr)
+{
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ u32 offset = (sector_nr << fs_info->sectorsize_bits);
+ const struct folio *folio = stripe->folios[offset >> min_folio_shift];
+
+ /* stripe->folios[] is allocated by us and no highmem is allowed. */
+ ASSERT(folio);
+ ASSERT(!folio_test_partial_kmap(folio));
+ /* And the range must be contained inside the folio. */
+ ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio));
+ return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset);
}
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
@@ -788,7 +805,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
- void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
+ phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr);
u8 csum_buf[BTRFS_CSUM_SIZE];
int ret;
@@ -833,7 +850,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
return;
}
- ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum);
+ ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum);
if (ret < 0) {
scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
scrub_bitmap_set_bit_error(stripe, sector_nr);
@@ -1369,8 +1386,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
* Slice is divided into intervals when the IO is submitted, adjust by
* bwlimit and maximum of 64 intervals.
*/
- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
- div = min_t(u32, 64, div);
+ div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);
/* Start new epoch, set deadline */
now = ktime_get();
@@ -1513,7 +1529,7 @@ static int find_first_extent_item(struct btrfs_root *extent_root,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
@@ -1859,6 +1875,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
@@ -1871,7 +1888,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
return;
}
- bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
+ bbio = btrfs_bio_alloc(BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
@@ -1970,7 +1987,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
* metadata, we should immediately abort.
*/
for (int i = 0; i < nr_stripes; i++) {
- if (stripe_has_metadata_error(&sctx->stripes[i])) {
+ if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) {
ret = -EIO;
goto out;
}
@@ -2164,7 +2181,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
* As we may hit an empty data stripe while it's missing.
*/
bitmap_and(&error, &error, &has_extent, stripe->nr_sectors);
- if (!bitmap_empty(&error, stripe->nr_sectors)) {
+ if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) {
btrfs_err(fs_info,
"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
full_stripe_start, i, stripe->nr_sectors,
@@ -2202,7 +2219,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
for (int i = 0; i < data_stripes; i++) {
stripe = &sctx->raid56_data_stripes[i];
- raid56_parity_cache_data_pages(rbio, stripe->pages,
+ raid56_parity_cache_data_folios(rbio, stripe->folios,
full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
}
raid56_parity_submit_scrub_rbio(rbio);
@@ -2586,7 +2603,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev, u64 start, u64 end)
{
struct btrfs_dev_extent *dev_extent = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->dev_root;
u64 chunk_offset;
@@ -2858,8 +2875,8 @@ skip_unfreeze:
btrfs_put_block_group(cache);
if (ret)
break;
- if (sctx->is_dev_replace &&
- atomic64_read(&dev_replace->num_write_errors) > 0) {
+ if (unlikely(sctx->is_dev_replace &&
+ atomic64_read(&dev_replace->num_write_errors) > 0)) {
ret = -EIO;
break;
}
@@ -2872,8 +2889,6 @@ skip:
btrfs_release_path(path);
}
- btrfs_free_path(path);
-
return ret;
}
@@ -2889,13 +2904,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
if (ret < 0)
return ret;
ret = btrfs_check_super_csum(fs_info, sb);
- if (ret != 0) {
+ if (unlikely(ret != 0)) {
btrfs_err_rl(fs_info,
"scrub: super block at physical %llu devid %llu has bad csum",
physical, dev->devid);
return -EIO;
}
- if (btrfs_super_generation(sb) != generation) {
+ if (unlikely(btrfs_super_generation(sb) != generation)) {
btrfs_err_rl(fs_info,
"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
physical, dev->devid,
@@ -3013,7 +3028,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
- int readonly, int is_dev_replace)
+ bool readonly, bool is_dev_replace)
{
struct btrfs_dev_lookup_args args = { .devid = devid };
struct scrub_ctx *sctx;
@@ -3065,8 +3080,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
mutex_lock(&fs_info->scrub_lock);
- if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
- test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
+ if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
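
The new scrub_stripe_get_kaddr()/scrub_stripe_get_paddr() helpers above index stripe->folios[] by shifting the byte offset right by min_folio_shift = PAGE_SHIFT + fs_info->block_min_order. A worked sketch of that lookup with local, illustrative names; it assumes block_min_order is 0 when the block size fits in a page, and grows so that one block always fits in a single folio otherwise:

static void *stripe_block_addr(struct folio **folios, int sector_nr,
			       u32 sectorsize_bits, u32 block_min_order)
{
	const u32 min_folio_shift = PAGE_SHIFT + block_min_order;
	const u32 offset = sector_nr << sectorsize_bits;
	struct folio *folio = folios[offset >> min_folio_shift];

	/*
	 * e.g. 4K blocks on 4K pages (block_min_order == 0), sector_nr == 9:
	 * offset == 0x9000 -> folios[9], offset_in_folio() == 0.
	 */
	return folio_address(folio) + offset_in_folio(folio, offset);
}
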
diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h
index f0df597b75c7..aa68b6ebaf55 100644
--- a/fs/btrfs/scrub.h
+++ b/fs/btrfs/scrub.h
@@ -11,7 +11,7 @@ struct btrfs_scrub_progress;
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
- int readonly, int is_dev_replace);
+ bool readonly, bool is_dev_replace);
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7664025a5af4..9230e5066fc6 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -646,7 +646,7 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
ret = kernel_write(filp, buf + pos, len - pos, off);
if (ret < 0)
return ret;
- if (ret == 0)
+ if (unlikely(ret == 0))
return -EIO;
pos += ret;
}
@@ -909,7 +909,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
struct btrfs_inode_info *info)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_item *ii;
struct btrfs_key key;
@@ -924,11 +924,11 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto out;
+ return ret;
}
if (!info)
- goto out;
+ return 0;
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
@@ -945,9 +945,7 @@ static int get_inode_info(struct btrfs_root *root, u64 ino,
*/
info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
@@ -973,13 +971,13 @@ typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx);
* path must point to the INODE_REF or INODE_EXTREF when called.
*/
static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *found_key, int resolve,
+ struct btrfs_key *found_key, bool resolve,
iterate_inode_ref_t iterate, void *ctx)
{
struct extent_buffer *eb = path->nodes[0];
struct btrfs_inode_ref *iref;
struct btrfs_inode_extref *extref;
- struct btrfs_path *tmp_path;
+ BTRFS_PATH_AUTO_FREE(tmp_path);
struct fs_path *p;
u32 cur = 0;
u32 total;
@@ -1076,7 +1074,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- btrfs_free_path(tmp_path);
fs_path_free(p);
return ret;
}
@@ -1224,7 +1221,7 @@ static int get_inode_path(struct btrfs_root *root,
{
int ret;
struct btrfs_key key, found_key;
- struct btrfs_path *p;
+ BTRFS_PATH_AUTO_FREE(p);
p = alloc_path_for_send();
if (!p)
@@ -1238,28 +1235,20 @@ static int get_inode_path(struct btrfs_root *root,
ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 1;
- goto out;
- }
+ return ret;
+ if (ret)
+ return 1;
+
btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
if (found_key.objectid != ino ||
(found_key.type != BTRFS_INODE_REF_KEY &&
- found_key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = -ENOENT;
- goto out;
- }
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ return -ENOENT;
- ret = iterate_inode_ref(root, p, &found_key, 1,
- __copy_first_ref, path);
+ ret = iterate_inode_ref(root, p, &found_key, true, __copy_first_ref, path);
if (ret < 0)
- goto out;
- ret = 0;
-
-out:
- btrfs_free_path(p);
- return ret;
+ return ret;
+ return 0;
}
struct backref_ctx {
@@ -1389,7 +1378,7 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
struct backref_ctx *bctx = ctx;
struct send_ctx *sctx = bctx->sctx;
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
- const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
+ const u64 key = leaf_bytenr >> fs_info->nodesize_bits;
struct btrfs_lru_cache_entry *raw_entry;
struct backref_cache_entry *entry;
@@ -1444,7 +1433,7 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
if (!new_entry)
return;
- new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
+ new_entry->entry.key = leaf_bytenr >> fs_info->nodesize_bits;
new_entry->entry.gen = 0;
new_entry->num_roots = 0;
ULIST_ITER_INIT(&uiter);
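
The two hunks above change the backref cache key from leaf_bytenr >> sectorsize_bits to leaf_bytenr >> nodesize_bits. Since extent buffer byte numbers are aligned to the node size, the larger shift still yields one distinct key per leaf while keeping the key space denser. The derivation in isolation, illustrative only:

static u64 backref_cache_key(u64 leaf_bytenr, u32 nodesize_bits)
{
	return leaf_bytenr >> nodesize_bits;	/* was: >> sectorsize_bits */
}
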
@@ -1716,7 +1705,7 @@ static int read_symlink(struct btrfs_root *root,
struct fs_path *dest)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_file_extent_item *ei;
u8 type;
@@ -1733,21 +1722,20 @@ static int read_symlink(struct btrfs_root *root,
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret) {
+ return ret;
+ if (unlikely(ret)) {
/*
* An empty symlink inode. Can happen in rare error paths when
* creating a symlink (transaction committed before the inode
* eviction handler removed the symlink inode items and a crash
- * happened in between or the subvol was snapshoted in between).
+ * happened in between or the subvol was snapshotted in between).
* Print an informative message to dmesg/syslog so that the user
* can delete the symlink.
*/
btrfs_err(root->fs_info,
"Found empty symlink inode %llu at root %llu",
ino, btrfs_root_id(root));
- ret = -EIO;
- goto out;
+ return -EIO;
}
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -1758,7 +1746,7 @@ static int read_symlink(struct btrfs_root *root,
btrfs_crit(root->fs_info,
"send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
ino, btrfs_root_id(root), type);
- goto out;
+ return ret;
}
compression = btrfs_file_extent_compression(path->nodes[0], ei);
if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
@@ -1766,17 +1754,13 @@ static int read_symlink(struct btrfs_root *root,
btrfs_crit(root->fs_info,
"send: found symlink extent with compression, ino %llu root %llu compression type %d",
ino, btrfs_root_id(root), compression);
- goto out;
+ return ret;
}
off = btrfs_file_extent_inline_start(ei);
len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
}
/*
@@ -1787,8 +1771,7 @@ static int gen_unique_name(struct send_ctx *sctx,
u64 ino, u64 gen,
struct fs_path *dest)
{
- int ret = 0;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
char tmp[64];
int len;
@@ -1811,10 +1794,9 @@ static int gen_unique_name(struct send_ctx *sctx,
path, BTRFS_FIRST_FREE_OBJECTID,
&tmp_name, 0);
btrfs_release_path(path);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+
if (di) {
/* not unique, try again */
idx++;
@@ -1823,7 +1805,6 @@ static int gen_unique_name(struct send_ctx *sctx,
if (!sctx->parent_root) {
/* unique */
- ret = 0;
break;
}
@@ -1831,10 +1812,9 @@ static int gen_unique_name(struct send_ctx *sctx,
path, BTRFS_FIRST_FREE_OBJECTID,
&tmp_name, 0);
btrfs_release_path(path);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+
if (di) {
/* not unique, try again */
idx++;
@@ -1844,11 +1824,7 @@ static int gen_unique_name(struct send_ctx *sctx,
break;
}
- ret = fs_path_add(dest, tmp, len);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return fs_path_add(dest, tmp, len);
}
enum inode_state {
@@ -1960,7 +1936,7 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
int ret = 0;
struct btrfs_dir_item *di;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
path = alloc_path_for_send();
@@ -1968,19 +1944,15 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
return -ENOMEM;
di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
- if (IS_ERR_OR_NULL(di)) {
- ret = di ? PTR_ERR(di) : -ENOENT;
- goto out;
- }
+ if (IS_ERR_OR_NULL(di))
+ return di ? PTR_ERR(di) : -ENOENT;
+
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
- if (key.type == BTRFS_ROOT_ITEM_KEY) {
- ret = -ENOENT;
- goto out;
- }
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ return -ENOENT;
+
*found_inode = key.objectid;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -1994,7 +1966,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int len;
u64 parent_dir;
@@ -2008,16 +1980,14 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
if (ret < 0)
- goto out;
+ return ret;
if (!ret)
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (ret || found_key.objectid != ino ||
(found_key.type != BTRFS_INODE_REF_KEY &&
- found_key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = -ENOENT;
- goto out;
- }
+ found_key.type != BTRFS_INODE_EXTREF_KEY))
+ return -ENOENT;
if (found_key.type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
@@ -2038,19 +2008,17 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
}
if (ret < 0)
- goto out;
+ return ret;
btrfs_release_path(path);
if (dir_gen) {
ret = get_inode_gen(root, parent_dir, dir_gen);
if (ret < 0)
- goto out;
+ return ret;
}
*dir = parent_dir;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -2486,7 +2454,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
int ret;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_root *parent_root = sctx->parent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
@@ -2498,10 +2466,8 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
- if (!name) {
- btrfs_free_path(path);
+ if (!name)
return -ENOMEM;
- }
key.objectid = btrfs_root_id(send_root);
key.type = BTRFS_ROOT_BACKREF_KEY;
@@ -2564,7 +2530,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
tlv_put_failure:
out:
- btrfs_free_path(path);
kfree(name);
return ret;
}
@@ -2715,7 +2680,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
int ret = 0;
struct fs_path *p = NULL;
struct btrfs_inode_item *ii;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
struct btrfs_key key;
int slot;
@@ -2759,7 +2724,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
tlv_put_failure:
out:
free_path_for_command(sctx, p);
- btrfs_free_path(path);
return ret;
}
@@ -2769,7 +2733,7 @@ out:
* processing an inode that is a directory and it just got renamed, and existing
* entries in the cache may refer to inodes that have the directory in their
* full path - in which case we would generate outdated paths (pre-rename)
- * for the inodes that the cache entries point to. Instead of prunning the
+ * for the inodes that the cache entries point to. Instead of pruning the
* cache when inserting, do it after we finish processing each inode at
* finish_inode_if_needed().
*/
@@ -2930,7 +2894,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
int ret = 0;
int iter_ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_key di_key;
@@ -2970,7 +2934,6 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
@@ -3750,7 +3713,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref,
const bool is_orphan)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key di_key;
struct btrfs_dir_item *di;
@@ -3771,19 +3734,15 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 0;
di = btrfs_match_dir_item_name(path, parent_ref->name,
parent_ref->name_len);
- if (!di) {
- ret = 0;
- goto out;
- }
+ if (!di)
+ return 0;
/*
* di_key.objectid has the number of the inode that has a dentry in the
* parent directory with the same name that sctx->cur_ino is being
@@ -3793,26 +3752,22 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
* that it happens after that other inode is renamed.
*/
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
- if (di_key.type != BTRFS_INODE_ITEM_KEY) {
- ret = 0;
- goto out;
- }
+ if (di_key.type != BTRFS_INODE_ITEM_KEY)
+ return 0;
ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
if (ret < 0)
- goto out;
+ return ret;
ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
- goto out;
+ return ret;
}
/* Different inode, no need to delay the rename of sctx->cur_ino */
- if (right_gen != left_gen) {
- ret = 0;
- goto out;
- }
+ if (right_gen != left_gen)
+ return 0;
wdm = get_waiting_dir_move(sctx, di_key.objectid);
if (wdm && !wdm->orphanized) {
@@ -3826,8 +3781,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
if (!ret)
ret = 1;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -3877,7 +3830,7 @@ static int is_ancestor(struct btrfs_root *root,
bool free_fs_path = false;
int ret = 0;
int iter_ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
if (!fs_path) {
@@ -3945,7 +3898,6 @@ static int is_ancestor(struct btrfs_root *root,
ret = iter_ret;
out:
- btrfs_free_path(path);
if (free_fs_path)
fs_path_free(fs_path);
return ret;
@@ -4756,8 +4708,8 @@ static int record_new_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ false, record_new_ref_if_needed, sctx);
if (ret < 0)
return ret;
@@ -4768,9 +4720,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, record_deleted_ref_if_needed,
- sctx);
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key,
+ false, record_deleted_ref_if_needed, sctx);
if (ret < 0)
return ret;
@@ -4781,12 +4732,12 @@ static int record_changed_ref(struct send_ctx *sctx)
{
int ret;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key,
+ false, record_new_ref_if_needed, sctx);
if (ret < 0)
return ret;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key,
+ false, record_deleted_ref_if_needed, sctx);
if (ret < 0)
return ret;
@@ -4803,7 +4754,7 @@ static int process_all_refs(struct send_ctx *sctx,
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
iterate_inode_ref_t cb;
@@ -4822,8 +4773,7 @@ static int process_all_refs(struct send_ctx *sctx,
} else {
btrfs_err(sctx->send_root->fs_info,
"Wrong command %d in process_all_refs", cmd);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
key.objectid = sctx->cmp_key->objectid;
@@ -4835,15 +4785,14 @@ static int process_all_refs(struct send_ctx *sctx,
found_key.type != BTRFS_INODE_EXTREF_KEY))
break;
- ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
+ ret = iterate_inode_ref(root, path, &found_key, false, cb, sctx);
if (ret < 0)
- goto out;
+ return ret;
}
/* Catch error found during iteration */
- if (iter_ret < 0) {
- ret = iter_ret;
- goto out;
- }
+ if (iter_ret < 0)
+ return iter_ret;
+
btrfs_release_path(path);
/*
@@ -4851,10 +4800,7 @@ static int process_all_refs(struct send_ctx *sctx,
* re-creating this inode and will be rename'ing it into place once we
* rename the parent directory.
*/
- ret = process_recorded_refs(sctx, &pending_move);
-out:
- btrfs_free_path(path);
- return ret;
+ return process_recorded_refs(sctx, &pending_move);
}
static int send_set_xattr(struct send_ctx *sctx,
@@ -5080,7 +5026,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
@@ -5108,7 +5054,6 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
@@ -5254,7 +5199,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (!folio_test_uptodate(folio)) {
btrfs_read_folio(NULL, folio);
folio_lock(folio);
- if (!folio_test_uptodate(folio)) {
+ if (unlikely(!folio_test_uptodate(folio))) {
folio_unlock(folio);
btrfs_err(fs_info,
"send: IO error at offset %llu for inode %llu root %llu",
@@ -5656,7 +5601,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
ei = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
+ /*
+ * Do not go through encoded read for bs > ps cases.
+ *
+ * Encoded send uses vmalloc'ed pages as its buffer, so we cannot
+ * ensure every folio is large enough to contain a block.
+ */
+ if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE &&
+ (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
BTRFS_FILE_EXTENT_INLINE);
@@ -5766,7 +5718,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
*/
static int send_capabilities(struct send_ctx *sctx)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
struct extent_buffer *leaf;
unsigned long data_ptr;
@@ -5804,7 +5756,6 @@ static int send_capabilities(struct send_ctx *sctx)
strlen(XATTR_NAME_CAPS), buf, buf_len);
out:
kfree(buf);
- btrfs_free_path(path);
return ret;
}
@@ -5812,7 +5763,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
struct clone_root *clone_root, const u64 disk_byte,
u64 data_offset, u64 offset, u64 len)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret;
struct btrfs_inode_info info;
@@ -5848,7 +5799,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = get_inode_info(clone_root->root, clone_root->ino, &info);
btrfs_release_path(path);
if (ret < 0)
- goto out;
+ return ret;
clone_src_i_size = info.size;
/*
@@ -5878,7 +5829,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
key.offset = clone_root->offset;
ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && path->slots[0] > 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
if (key.objectid == clone_root->ino &&
@@ -5899,7 +5850,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
if (ret < 0)
- goto out;
+ return ret;
else if (ret > 0)
break;
continue;
@@ -5936,7 +5887,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_extent_data(sctx, dst_path, offset,
hole_len);
if (ret < 0)
- goto out;
+ return ret;
len -= hole_len;
if (len == 0)
@@ -6007,7 +5958,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_clone(sctx, offset, slen,
clone_root);
if (ret < 0)
- goto out;
+ return ret;
}
ret = send_extent_data(sctx, dst_path,
offset + slen,
@@ -6041,7 +5992,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
}
if (ret < 0)
- goto out;
+ return ret;
len -= clone_len;
if (len == 0)
@@ -6072,8 +6023,6 @@ next:
ret = send_extent_data(sctx, dst_path, offset, len);
else
ret = 0;
-out:
- btrfs_free_path(path);
return ret;
}
@@ -6162,7 +6111,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
{
int ret = 0;
struct btrfs_key key;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *eb;
int slot;
struct btrfs_key found_key;
@@ -6188,10 +6137,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
left_type = btrfs_file_extent_type(eb, ei);
- if (left_type != BTRFS_FILE_EXTENT_REG) {
- ret = 0;
- goto out;
- }
+ if (left_type != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
left_len = btrfs_file_extent_num_bytes(eb, ei);
left_offset = btrfs_file_extent_offset(eb, ei);
@@ -6223,11 +6171,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
key.offset = ekey->offset;
ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ return ret;
+ if (ret)
+ return 0;
/*
* Handle special case where the right side has no extents at all.
@@ -6236,11 +6182,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
- found_key.type != key.type) {
+ found_key.type != key.type)
/* If we're a hole then just pretend nothing changed */
- ret = (left_disknr) ? 0 : 1;
- goto out;
- }
+ return (left_disknr ? 0 : 1);
/*
* We're now on 2a, 2b or 7.
@@ -6250,10 +6194,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
right_type = btrfs_file_extent_type(eb, ei);
if (right_type != BTRFS_FILE_EXTENT_REG &&
- right_type != BTRFS_FILE_EXTENT_INLINE) {
- ret = 0;
- goto out;
- }
+ right_type != BTRFS_FILE_EXTENT_INLINE)
+ return 0;
if (right_type == BTRFS_FILE_EXTENT_INLINE) {
right_len = btrfs_file_extent_ram_bytes(eb, ei);
@@ -6266,11 +6208,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
* Are we at extent 8? If yes, we know the extent is changed.
* This may only happen on the first iteration.
*/
- if (found_key.offset + right_len <= ekey->offset) {
+ if (found_key.offset + right_len <= ekey->offset)
/* If we're a hole just pretend nothing changed */
- ret = (left_disknr) ? 0 : 1;
- goto out;
- }
+ return (left_disknr ? 0 : 1);
/*
* We just wanted to see if when we have an inline extent, what
@@ -6280,10 +6220,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
* compressed extent representing data with a size matching
* the page size (currently the same as sector size).
*/
- if (right_type == BTRFS_FILE_EXTENT_INLINE) {
- ret = 0;
- goto out;
- }
+ if (right_type == BTRFS_FILE_EXTENT_INLINE)
+ return 0;
right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
right_offset = btrfs_file_extent_offset(eb, ei);
@@ -6303,17 +6241,15 @@ static int is_extent_unchanged(struct send_ctx *sctx,
*/
if (left_disknr != right_disknr ||
left_offset_fixed != right_offset ||
- left_gen != right_gen) {
- ret = 0;
- goto out;
- }
+ left_gen != right_gen)
+ return 0;
/*
* Go to the next extent.
*/
ret = btrfs_next_item(sctx->parent_root, path);
if (ret < 0)
- goto out;
+ return ret;
if (!ret) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -6324,10 +6260,9 @@ static int is_extent_unchanged(struct send_ctx *sctx,
key.offset += right_len;
break;
}
- if (found_key.offset != key.offset + right_len) {
- ret = 0;
- goto out;
- }
+ if (found_key.offset != key.offset + right_len)
+ return 0;
+
key = found_key;
}
@@ -6340,15 +6275,12 @@ static int is_extent_unchanged(struct send_ctx *sctx,
else
ret = 0;
-
-out:
- btrfs_free_path(path);
return ret;
}
static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = sctx->send_root;
struct btrfs_key key;
int ret;
@@ -6364,15 +6296,13 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
key.offset = offset;
ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
- goto out;
+ return ret;
sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -6380,7 +6310,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
const u64 start,
const u64 end)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root = sctx->parent_root;
u64 search_start = start;
@@ -6395,7 +6325,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
key.offset = search_start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret > 0 && path->slots[0] > 0)
path->slots[0]--;
@@ -6408,8 +6338,8 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -6431,15 +6361,11 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
search_start = extent_end;
goto next;
}
- ret = 0;
- goto out;
+ return 0;
next:
path->slots[0]++;
}
- ret = 1;
-out:
- btrfs_free_path(path);
- return ret;
+ return 1;
}
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
@@ -6547,7 +6473,7 @@ static int process_all_extents(struct send_ctx *sctx)
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
@@ -6574,11 +6500,10 @@ static int process_all_extents(struct send_ctx *sctx)
if (iter_ret < 0)
ret = iter_ret;
- btrfs_free_path(path);
return ret;
}
-static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, bool at_end,
int *pending_move,
int *refs_processed)
{
@@ -6601,7 +6526,7 @@ out:
return ret;
}
-static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
+static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end)
{
int ret = 0;
struct btrfs_inode_info info;
@@ -7036,7 +6961,7 @@ static int changed_ref(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) {
inconsistent_snapshot_error(sctx, result, "reference");
return -EIO;
}
@@ -7064,7 +6989,7 @@ static int changed_xattr(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ if (unlikely(sctx->cur_ino != sctx->cmp_key->objectid)) {
inconsistent_snapshot_error(sctx, result, "xattr");
return -EIO;
}
@@ -7304,7 +7229,7 @@ static int search_key_again(const struct send_ctx *sctx,
*/
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
ASSERT(ret <= 0);
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
btrfs_print_tree(path->nodes[path->lowest_level], false);
btrfs_err(root->fs_info,
"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
@@ -7324,7 +7249,7 @@ static int full_send_tree(struct send_ctx *sctx)
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
struct btrfs_fs_info *fs_info = send_root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = alloc_path_for_send();
if (!path)
@@ -7341,7 +7266,7 @@ static int full_send_tree(struct send_ctx *sctx)
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
goto out_finish;
@@ -7351,7 +7276,7 @@ static int full_send_tree(struct send_ctx *sctx)
ret = changed_cb(path, NULL, &key,
BTRFS_COMPARE_TREE_NEW, sctx);
if (ret < 0)
- goto out;
+ return ret;
down_read(&fs_info->commit_root_sem);
if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
@@ -7370,14 +7295,14 @@ static int full_send_tree(struct send_ctx *sctx)
btrfs_release_path(path);
ret = search_key_again(sctx, send_root, path, &key);
if (ret < 0)
- goto out;
+ return ret;
} else {
up_read(&fs_info->commit_root_sem);
}
ret = btrfs_next_item(send_root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret) {
ret = 0;
break;
@@ -7385,11 +7310,7 @@ static int full_send_tree(struct send_ctx *sctx)
}
out_finish:
- ret = finish_inode_if_needed(sctx, 1);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return finish_inode_if_needed(sctx, 1);
}
static int replace_node_with_clone(struct btrfs_path *path, int level)
@@ -7644,8 +7565,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
int cmp;
- struct btrfs_path *left_path = NULL;
- struct btrfs_path *right_path = NULL;
+ BTRFS_PATH_AUTO_FREE(left_path);
+ BTRFS_PATH_AUTO_FREE(right_path);
struct btrfs_key left_key;
struct btrfs_key right_key;
char *tmp_buf = NULL;
@@ -7918,8 +7839,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
out_unlock:
up_read(&fs_info->commit_root_sem);
out:
- btrfs_free_path(left_path);
- btrfs_free_path(right_path);
kvfree(tmp_buf);
return ret;
}
@@ -7986,7 +7905,7 @@ static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
}
/*
- * Make sure any existing dellaloc is flushed for any root used by a send
+ * Make sure any existing delalloc is flushed for any root used by a send
* operation so that we do not miss any data and we do not race with writeback
* finishing and changing a tree while send is using the tree. This could
* happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 0481c693ac2e..97452fb5d29b 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -479,7 +479,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
/*
* On the zoned mode, we always allocate one zone as one chunk.
- * Returning non-zone size alingned bytes here will result in
+ * Returning non-zone size aligned bytes here will result in
* less pressure for the async metadata reclaim process, and it
* will over-commit too much leading to ENOSPC. Align down to the
* zone size to avoid that.
@@ -1528,7 +1528,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
* turned into error mode due to a transaction abort when flushing space
* above, in that case fail with the abort error instead of returning
* success to the caller if we can steal from the global rsv - this is
- * just to have caller fail immeditelly instead of later when trying to
+ * just to have caller fail immediately instead of later when trying to
* modify the fs, making it easier to debug -ENOSPC problems.
*/
if (BTRFS_FS_ERROR(fs_info)) {
@@ -1830,7 +1830,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq, async_work);
+ queue_work(system_dfl_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1847,7 +1847,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
need_preemptive_reclaim(fs_info, space_info)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
- queue_work(system_unbound_wq,
+ queue_work(system_dfl_wq,
&fs_info->preempt_reclaim_work);
}
}
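
The queue_work() changes above only swap the workqueue symbol: with the workqueue API rename, system_dfl_wq is the new name for the default unbound queue that system_unbound_wq used to provide. A tiny usage sketch (everything except the workqueue API names is made up):

#include <linux/workqueue.h>

static void demo_reclaim_fn(struct work_struct *work)
{
	/* reclaim work runs here, not bound to any particular CPU */
}

static DECLARE_WORK(demo_reclaim_work, demo_reclaim_fn);

static void demo_trigger_flush(void)
{
	/* formerly queue_work(system_unbound_wq, ...) */
	queue_work(system_dfl_wq, &demo_reclaim_work);
}
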
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index cb4f97833dc3..5ca8d4db6722 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -690,7 +690,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
\
GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \
btrfs_warn(fs_info, \
- "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
+ "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
start, len, folio_pos(folio), \
blocks_per_folio, &bitmap); \
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index ee0710eb13fd..ad0552db7c7d 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -13,7 +13,7 @@ struct address_space;
struct folio;
/*
- * Extra info for subpapge bitmap.
+ * Extra info for subpage bitmap.
*
* For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into
* one larger bitmap.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a262b494a89f..d6e496436539 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -133,9 +133,8 @@ enum {
Opt_enospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
+ Opt_ref_tracker,
#endif
Opt_err,
};
@@ -257,8 +256,7 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
fsparam_flag_no("enospc_debug", Opt_enospc_debug),
#ifdef CONFIG_BTRFS_DEBUG
fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ fsparam_flag("ref_tracker", Opt_ref_tracker),
fsparam_flag("ref_verify", Opt_ref_verify),
#endif
{}
@@ -276,6 +274,7 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
const struct fs_parameter *param, int opt)
{
const char *string = param->string;
+ int ret;
/*
* Provide the same semantics as older kernels that don't use fs
@@ -294,21 +293,30 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
} else if (btrfs_match_compress_type(string, "zlib", true)) {
ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
- string + 4);
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (btrfs_match_compress_type(string, "lzo", false)) {
+ } else if (btrfs_match_compress_type(string, "lzo", true)) {
ctx->compress_type = BTRFS_COMPRESS_LZO;
- ctx->compress_level = 0;
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
+ if (string[3] == ':' && string[4])
+ btrfs_warn(NULL, "Compression level ignored for LZO");
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
} else if (btrfs_match_compress_type(string, "zstd", true)) {
ctx->compress_type = BTRFS_COMPRESS_ZSTD;
- ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
- string + 4);
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
@@ -319,10 +327,14 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
btrfs_clear_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
} else {
- btrfs_err(NULL, "unrecognized compression value %s", string);
- return -EINVAL;
+ ret = -EINVAL;
+ goto error;
}
return 0;
+error:
+ btrfs_err(NULL, "failed to parse compression option '%s'", string);
+ return ret;
+
}
static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
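
The parser rework above changes btrfs_compress_str2level() from returning a level to returning an errno and filling the level through a pointer, so malformed suffixes such as "compress=zstd:abc" now fail instead of silently falling back to a default. A hypothetical sketch of that calling convention (demo_str2level and its limits are made up, not the real helper):

#include <linux/errno.h>
#include <linux/kstrtox.h>

static int demo_str2level(const char *str, int default_level, int max_level,
			  int *level_ret)
{
	int level;
	int ret;

	if (*str != ':') {			/* no ":N" suffix, use default */
		*level_ret = default_level;
		return 0;
	}
	ret = kstrtoint(str + 1, 10, &level);	/* "zstd:abc" -> -EINVAL */
	if (ret < 0)
		return ret;
	if (level < 1 || level > max_level)
		return -EINVAL;
	*level_ret = level;
	return 0;
}
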
@@ -632,11 +644,12 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
}
break;
-#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
case Opt_ref_verify:
btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
break;
+ case Opt_ref_tracker:
+ btrfs_set_opt(ctx->mount_opt, REF_TRACKER);
+ break;
#endif
default:
btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
@@ -912,7 +925,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_dir_item *di;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key location;
struct fscrypt_str name = FSTR_INIT("default", 7);
u64 dir_id;
@@ -929,7 +942,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0);
if (IS_ERR(di)) {
- btrfs_free_path(path);
return PTR_ERR(di);
}
if (!di) {
@@ -938,13 +950,11 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
* it's always been there, but don't freak out, just try and
* mount the top-level subvolume.
*/
- btrfs_free_path(path);
*objectid = BTRFS_FS_TREE_OBJECTID;
return 0;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- btrfs_free_path(path);
*objectid = location.objectid;
return 0;
}
@@ -1079,7 +1089,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
- if (info->compress_level)
+ if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO)
seq_printf(seq, ":%d", info->compress_level);
}
if (btrfs_test_opt(info, NOSSD))
@@ -1142,6 +1152,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
#endif
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
+ if (btrfs_test_opt(info, REF_TRACKER))
+ seq_puts(seq, ",ref_tracker");
seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
subvol_name = btrfs_get_subvol_name_from_objectid(info,
btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
@@ -1268,7 +1280,7 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
/*
- * We need to cleanup all defragable inodes if the autodefragment is
+ * We need to cleanup all defraggable inodes if the autodefragment is
* close or the filesystem is read only.
*/
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
@@ -2260,10 +2272,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
device = btrfs_scan_one_device(vol->name, false);
if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
- if (IS_ERR(device))
- ret = PTR_ERR(device);
- else
- ret = 0;
+ ret = PTR_ERR_OR_ZERO(device);
break;
}
ret = !(device->fs_devices->num_devices ==
@@ -2316,14 +2325,14 @@ static int check_dev_super(struct btrfs_device *dev)
/* Verify the checksum. */
csum_type = btrfs_super_csum_type(sb);
- if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) {
+ if (unlikely(csum_type != btrfs_super_csum_type(fs_info->super_copy))) {
btrfs_err(fs_info, "csum type changed, has %u expect %u",
csum_type, btrfs_super_csum_type(fs_info->super_copy));
ret = -EUCLEAN;
goto out;
}
- if (btrfs_check_super_csum(fs_info, sb)) {
+ if (unlikely(btrfs_check_super_csum(fs_info, sb))) {
btrfs_err(fs_info, "csum for on-disk super block no longer matches");
ret = -EUCLEAN;
goto out;
@@ -2335,7 +2344,7 @@ static int check_dev_super(struct btrfs_device *dev)
goto out;
last_trans = btrfs_get_last_trans_committed(fs_info);
- if (btrfs_super_generation(sb) != last_trans) {
+ if (unlikely(btrfs_super_generation(sb) != last_trans)) {
btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
btrfs_super_generation(sb), last_trans);
ret = -EUCLEAN;
@@ -2472,9 +2481,6 @@ static int __init btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_ASSERT
", assert=on"
#endif
-#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- ", ref-verify=on"
-#endif
#ifdef CONFIG_BLK_DEV_ZONED
", zoned=yes"
#else
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 9d398f7a36ad..81f52c1f55ce 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -409,13 +409,17 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
char *buf)
{
ssize_t ret = 0;
+ bool has_output = false;
- if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE)
- ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE);
- if (PAGE_SIZE > SZ_4K)
- ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
- ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
-
+ for (u32 cur = BTRFS_MIN_BLOCKSIZE; cur <= BTRFS_MAX_BLOCKSIZE; cur *= 2) {
+ if (!btrfs_supported_blocksize(cur))
+ continue;
+ if (has_output)
+ ret += sysfs_emit_at(buf, ret, " ");
+ ret += sysfs_emit_at(buf, ret, "%u", cur);
+ has_output = true;
+ }
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_sectorsizes,
diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c
index 265370e79a54..e2248acb906b 100644
--- a/fs/btrfs/tests/delayed-refs-tests.c
+++ b/fs/btrfs/tests/delayed-refs-tests.c
@@ -997,12 +997,12 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize)
ret = simple_tests(&trans);
if (!ret) {
- test_msg("running delayed refs merg tests on metadata refs");
+ test_msg("running delayed refs merge tests on metadata refs");
ret = merge_tests(&trans, BTRFS_REF_METADATA);
}
if (!ret) {
- test_msg("running delayed refs merg tests on data refs");
+ test_msg("running delayed refs merge tests on data refs");
ret = merge_tests(&trans, BTRFS_REF_DATA);
}
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 3a86534c116f..42af6c737c6e 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -1095,7 +1095,7 @@ int btrfs_test_extent_map(void)
/*
* Test a chunk with 2 data stripes one of which
* intersects the physical address of the super block
- * is correctly recognised.
+ * is correctly recognized.
*/
.raid_type = BTRFS_BLOCK_GROUP_RAID1,
.physical_start = SZ_64M - SZ_4M,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c5c0d9cf1a80..89ae0c7a610a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -103,7 +103,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep;
* | attached to transid N+1. |
* | |
* | To next stage: |
- * | Until all tree blocks are super blocks are |
+ * | Until all tree blocks and super blocks are |
* | written to block devices |
* V |
* Transaction N [[TRANS_STATE_COMPLETED]] V
@@ -404,7 +404,7 @@ loop:
*/
static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- int force)
+ bool force)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
@@ -1569,7 +1569,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* qgroup counters could end up wrong.
*/
ret = btrfs_run_delayed_refs(trans, U64_MAX);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1641,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
struct btrfs_inode *parent_inode = pending->dir;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *dir_item;
struct extent_buffer *tmp;
struct extent_buffer *old;
@@ -1694,10 +1694,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto clear_skip_qgroup;
}
- key.objectid = objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
trans->bytes_reserved = trans->block_rsv->reserved;
@@ -1714,7 +1710,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* insert the directory item
*/
ret = btrfs_set_inode_index(parent_inode, &index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1735,7 +1731,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_create_qgroup(trans, objectid);
if (ret && ret != -EEXIST) {
- if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) {
+ if (unlikely(ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info))) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1748,13 +1744,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* snapshot
*/
ret = btrfs_run_delayed_items(trans);
- if (ret) { /* Transaction aborted */
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = record_root_in_trans(trans, root, 0);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1789,7 +1785,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
old = btrfs_lock_root_node(root);
ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
BTRFS_NESTING_COW);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
btrfs_abort_transaction(trans, ret);
@@ -1800,21 +1796,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* clean up in any case */
btrfs_tree_unlock(old);
free_extent_buffer(old);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
/* see comments in should_cow_block() */
set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
- smp_wmb();
+ smp_mb__after_atomic();
btrfs_set_root_node(new_root_item, tmp);
/* record when the snapshot was created in key.offset */
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = trans->transid;
ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
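
One note on the barrier change in the snapshot hunk above: set_bit() is a non-returning atomic with no implied ordering, and smp_mb__after_atomic() is the documented way to add a full barrier after it (smp_wmb() only ordered stores). A minimal illustration, with made-up names for the flag word and bit:

#include <linux/atomic.h>
#include <linux/bitops.h>

static unsigned long demo_state;
#define DEMO_FORCE_COW	0

static void demo_publish_flag(void)
{
	set_bit(DEMO_FORCE_COW, &demo_state);
	/*
	 * Order the flag store before whatever is published next (in the
	 * snapshot path, the new root node set on the root item).
	 */
	smp_mb__after_atomic();
}
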
@@ -1826,7 +1824,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_root_id(parent_root),
btrfs_ino(parent_inode), index,
&fname.disk_name);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1841,7 +1839,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
ret = btrfs_reloc_post_snapshot(trans, pending);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1864,7 +1862,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
parent_inode, &key, BTRFS_FT_DIR,
index);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1874,14 +1872,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
inode_set_mtime_to_ts(&parent_inode->vfs_inode,
inode_set_ctime_current(&parent_inode->vfs_inode));
ret = btrfs_update_inode_fallback(trans, parent_inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
BTRFS_UUID_KEY_SUBVOL,
objectid);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1889,7 +1887,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
objectid);
- if (ret && ret != -EEXIST) {
+ if (unlikely(ret && ret != -EEXIST)) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1907,7 +1905,6 @@ free_fname:
free_pending:
kfree(new_root_item);
pending->root_item = NULL;
- btrfs_free_path(path);
pending->path = NULL;
return ret;
@@ -2423,7 +2420,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* them.
*
* We needn't worry that this operation will corrupt the snapshots,
- * because all the tree which are snapshoted will be forced to COW
+ * because all the trees which are snapshotted will be forced to COW
* the nodes and leaves.
*/
ret = btrfs_run_delayed_items(trans);
@@ -2657,9 +2654,9 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- ret = btrfs_drop_snapshot(root, 0, 0);
+ ret = btrfs_drop_snapshot(root, false, false);
else
- ret = btrfs_drop_snapshot(root, 1, 0);
+ ret = btrfs_drop_snapshot(root, true, false);
btrfs_put_root(root);
return (ret < 0) ? 0 : 1;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 0f556f4de3f9..ca30b15ea452 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf,
/* Only these key->types needs to be checked */
ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY ||
key->type == BTRFS_DIR_INDEX_KEY ||
key->type == BTRFS_DIR_ITEM_KEY ||
key->type == BTRFS_EXTENT_DATA_KEY);
@@ -1209,7 +1210,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
/*
* For legacy root item, the members starting at generation_v2 will be
* all filled with 0.
- * And since we allow geneartion_v2 as 0, it will still pass the check.
+ * And since we allow generation_v2 as 0, it will still pass the check.
*/
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
btrfs_item_size(leaf, slot));
@@ -1756,10 +1757,10 @@ static int check_inode_ref(struct extent_buffer *leaf,
while (ptr < end) {
u16 namelen;
- if (unlikely(ptr + sizeof(iref) > end)) {
+ if (unlikely(ptr + sizeof(*iref) > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
- ptr, end, sizeof(iref));
+ ptr, end, sizeof(*iref));
return -EUCLEAN;
}
@@ -1782,6 +1783,39 @@ static int check_inode_ref(struct extent_buffer *leaf,
return 0;
}
+static int check_inode_extref(struct extent_buffer *leaf,
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
+{
+ unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
+ unsigned long end = ptr + btrfs_item_size(leaf, slot);
+
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
+ return -EUCLEAN;
+
+ while (ptr < end) {
+ struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr;
+ u16 namelen;
+
+ if (unlikely(ptr + sizeof(*extref) > end)) {
+ inode_ref_err(leaf, slot,
+ "inode extref overflow, ptr %lu end %lu inode_extref size %zu",
+ ptr, end, sizeof(*extref));
+ return -EUCLEAN;
+ }
+
+ namelen = btrfs_inode_extref_name_len(leaf, extref);
+ if (unlikely(ptr + sizeof(*extref) + namelen > end)) {
+ inode_ref_err(leaf, slot,
+ "inode extref overflow, ptr %lu end %lu namelen %u",
+ ptr, end, namelen);
+ return -EUCLEAN;
+ }
+ ptr += sizeof(*extref) + namelen;
+ }
+ return 0;
+}
+
static int check_raid_stripe_extent(const struct extent_buffer *leaf,
const struct btrfs_key *key, int slot)
{
@@ -1893,6 +1927,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
case BTRFS_INODE_REF_KEY:
ret = check_inode_ref(leaf, key, prev_key, slot);
break;
+ case BTRFS_INODE_EXTREF_KEY:
+ ret = check_inode_extref(leaf, key, prev_key, slot);
+ break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
ret = check_block_group_item(leaf, key, slot);
break;
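
For reference, the records that the new check_inode_extref() bounds-checks are packed back to back in the item: a fixed header followed by name_len bytes of name, repeated until the item ends. The layout below mirrors the uapi definition (include/uapi/linux/btrfs_tree.h); treat it as a reading aid rather than a copy of that header:

/* One BTRFS_INODE_EXTREF_KEY record; several may share a single item. */
struct btrfs_inode_extref {
	__le64 parent_objectid;	/* directory that holds this name */
	__le64 index;		/* dir index of the name */
	__le16 name_len;	/* length of the name bytes that follow */
	__u8   name[];		/* name, not NUL terminated */
} __attribute__((packed));
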
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 69e11557fd13..6aad6b65522b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
#include "file-item.h"
#include "file.h"
#include "orphan.h"
+#include "print-tree.h"
#include "tree-checker.h"
#define MAX_CONFLICT_INODES 10
@@ -101,17 +102,134 @@ enum {
LOG_WALK_REPLAY_ALL,
};
+/*
+ * The walk control struct is used to pass state down the chain when processing
+ * the log tree. The stage field tells us which part of the log tree processing
+ * we are currently doing.
+ */
+struct walk_control {
+ /*
+ * Signal that we are freeing the metadata extents of a log tree.
+ * This is used at transaction commit time while freeing a log tree.
+ */
+ bool free;
+
+ /*
+ * Signal that we are pinning the metadata extents of a log tree and the
+ * data extents its leaves point to (if using mixed block groups).
+ * This happens in the first stage of log replay to ensure that during
+ * replay, while we are modifying subvolume trees, we don't overwrite
+ * the metadata extents of log trees.
+ */
+ bool pin;
+
+ /* What stage of the replay code we're currently in. */
+ int stage;
+
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY.
+ */
+ bool ignore_cur_inode;
+
+ /*
+ * The root we are currently replaying to. This is NULL for the replay
+ * stage LOG_WALK_PIN_ONLY.
+ */
+ struct btrfs_root *root;
+
+ /* The log tree we are currently processing (not NULL for any stage). */
+ struct btrfs_root *log;
+
+ /* The transaction handle used for replaying all log trees. */
+ struct btrfs_trans_handle *trans;
+
+ /*
+ * The function that gets used to process blocks we find in the tree.
+ * Note the extent_buffer might not be up to date when it is passed in,
+ * and it must be checked or read if you need the data inside it.
+ */
+ int (*process_func)(struct extent_buffer *eb,
+ struct walk_control *wc, u64 gen, int level);
+
+ /*
+ * The following are used only when stage is >= LOG_WALK_REPLAY_INODES
+ * and by the replay_one_buffer() callback.
+ */
+
+ /* The current log leaf being processed. */
+ struct extent_buffer *log_leaf;
+ /* The key being processed of the current log leaf. */
+ struct btrfs_key log_key;
+ /* The slot being processed of the current log leaf. */
+ int log_slot;
+
+ /* A path used for searches and modifications to subvolume trees. */
+ struct btrfs_path *subvol_path;
+};
+
+static void do_abort_log_replay(struct walk_control *wc, const char *function,
+ unsigned int line, int error, const char *fmt, ...)
+{
+ struct btrfs_fs_info *fs_info = wc->trans->fs_info;
+ struct va_format vaf;
+ va_list args;
+
+ /*
+ * Do nothing if we already aborted, to avoid dumping leaves again which
+ * can be verbose. Further more, only the first call is useful since it
+ * is where we have a problem. Note that we do not use the flag
+ * BTRFS_FS_STATE_TRANS_ABORTED because log replay calls functions that
+ * are outside of tree-log.c that can abort transactions (such as
+ * btrfs_add_link() for example), so if that happens we still want to
+ * dump all log replay specific information below.
+ */
+ if (test_and_set_bit(BTRFS_FS_STATE_LOG_REPLAY_ABORTED, &fs_info->fs_state))
+ return;
+
+ btrfs_abort_transaction(wc->trans, error);
+
+ if (wc->subvol_path->nodes[0]) {
+ btrfs_crit(fs_info,
+ "subvolume (root %llu) leaf currently being processed:",
+ btrfs_root_id(wc->root));
+ btrfs_print_leaf(wc->subvol_path->nodes[0]);
+ }
+
+ if (wc->log_leaf) {
+ btrfs_crit(fs_info,
+ "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):",
+ btrfs_root_id(wc->root), wc->log_slot,
+ wc->log_key.objectid, wc->log_key.type, wc->log_key.offset);
+ btrfs_print_leaf(wc->log_leaf);
+ }
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "log replay failed in %s:%u for root %llu, stage %d, with error %d: %pV",
+ function, line, btrfs_root_id(wc->root), wc->stage, error, &vaf);
+
+ va_end(args);
+}
+
+/*
+ * Use this for aborting a transaction during log replay while we are down the
+ * call chain of replay_one_buffer(), so that we get a lot more useful
+ * information for debugging issues when compared to a plain call to
+ * btrfs_abort_transaction().
+ */
+#define btrfs_abort_log_replay(wc, error, fmt, args...) \
+ do_abort_log_replay((wc), __func__, __LINE__, (error), fmt, ##args)
+
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
-static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid);
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
+static int link_to_fixup_dir(struct walk_control *wc, u64 objectid);
+static noinline int replay_dir_deletes(struct walk_control *wc,
u64 dirid, bool del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);
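
With the walk_control rework above, the callback no longer receives the log root as an argument; everything (transaction, log tree, destination root, current leaf/key/slot, scratch path) travels in the struct. A simplified sketch of how the recovery path could drive it; walk_log_tree()'s signature, the surrounding loop and the error handling are assumptions, only the field and stage names come from the hunks above:

static int demo_replay_one_log(struct btrfs_trans_handle *trans,
			       struct btrfs_root *log,
			       struct btrfs_root *subvol_root)
{
	struct walk_control wc = {
		.trans = trans,
		.log = log,
		.pin = true,
		.stage = LOG_WALK_PIN_ONLY,
		.process_func = process_one_buffer,
	};
	int ret;

	/* Stage 1: pin every metadata extent the log tree references. */
	ret = walk_log_tree(&wc);
	if (ret)
		return ret;

	/* Later stages replay items into the subvolume tree. */
	wc.pin = false;
	wc.root = subvol_root;
	wc.stage = LOG_WALK_REPLAY_INODES;
	wc.process_func = replay_one_buffer;
	return walk_log_tree(&wc);
}
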
@@ -300,53 +418,13 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
/*
- * the walk control struct is used to pass state down the chain when
- * processing the log tree. The stage field tells us which part
- * of the log tree processing we are currently doing. The others
- * are state fields used for that specific part
- */
-struct walk_control {
- /* should we free the extent on disk when done? This is used
- * at transaction commit time while freeing a log tree
- */
- int free;
-
- /* pin only walk, we record which extents on disk belong to the
- * log trees
- */
- int pin;
-
- /* what stage of the replay code we're currently in */
- int stage;
-
- /*
- * Ignore any items from the inode currently being processed. Needs
- * to be set every time we find a BTRFS_INODE_ITEM_KEY.
- */
- bool ignore_cur_inode;
-
- /* the root we are currently replaying */
- struct btrfs_root *replay_dest;
-
- /* the trans handle for the current replay */
- struct btrfs_trans_handle *trans;
-
- /* the function that gets used to process blocks we find in the
- * tree. Note the extent_buffer might not be up to date when it is
- * passed in, and it must be checked or read if you need the data
- * inside it
- */
- int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen, int level);
-};
-
-/*
* process_func used to pin down extents, write them or wait on them
*/
-static int process_one_buffer(struct btrfs_root *log,
- struct extent_buffer *eb,
+static int process_one_buffer(struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
+ struct btrfs_root *log = wc->log;
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
@@ -361,25 +439,36 @@ static int process_one_buffer(struct btrfs_root *log,
};
ret = btrfs_read_extent_buffer(eb, &check);
- if (ret)
+ if (unlikely(ret)) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
+ }
}
if (wc->pin) {
- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
- if (ret)
+ ASSERT(trans != NULL);
+ ret = btrfs_pin_extent_for_log_replay(trans, eb);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
return ret;
+ }
- if (btrfs_buffer_uptodate(eb, gen, 0) &&
- btrfs_header_level(eb) == 0)
+ if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) {
ret = btrfs_exclude_logged_extents(eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
}
return ret;
}
/*
- * Item overwrite used by log replay. The given eb, slot and key all refer to
- * the source data we are copying out.
+ * Item overwrite used by log replay. The given log tree leaf, slot and key
+ * from the walk_control structure all refer to the source data we are copying
+ * out.
*
* The given root is for the tree we are copying into, and path is a scratch
* path for use in this function (it should be released on entry and will be
@@ -391,12 +480,10 @@ static int process_one_buffer(struct btrfs_root *log,
*
* If the key isn't in the destination yet, a new item is inserted.
*/
-static int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int overwrite_item(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
int ret;
u32 item_size;
u64 saved_i_size = 0;
@@ -405,7 +492,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
unsigned long dst_ptr;
struct extent_buffer *dst_eb;
int dst_slot;
- bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
+ const bool is_inode_item = (wc->log_key.type == BTRFS_INODE_ITEM_KEY);
/*
* This is only used during log replay, so the root is always from a
@@ -416,16 +503,21 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
*/
ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
- item_size = btrfs_item_size(eb, slot);
- src_ptr = btrfs_item_ptr_offset(eb, slot);
+ item_size = btrfs_item_size(wc->log_leaf, wc->log_slot);
+ src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
/* Look for the key in the destination tree. */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key (%llu %u %llu) root %llu",
+ wc->log_key.objectid, wc->log_key.type,
+ wc->log_key.offset, btrfs_root_id(root));
return ret;
+ }
- dst_eb = path->nodes[0];
- dst_slot = path->slots[0];
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
if (ret == 0) {
char *src_copy;
@@ -435,16 +527,17 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
goto insert;
if (item_size == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
src_copy = kmalloc(item_size, GFP_NOFS);
if (!src_copy) {
- btrfs_release_path(path);
+ btrfs_abort_log_replay(wc, -ENOMEM,
+ "failed to allocate memory for log leaf item");
return -ENOMEM;
}
- read_extent_buffer(eb, src_copy, src_ptr, item_size);
+ read_extent_buffer(wc->log_leaf, src_copy, src_ptr, item_size);
dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
@@ -456,7 +549,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* sync
*/
if (ret == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -464,7 +557,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* We need to load the old nbytes into the inode so when we
* replay the extents we've logged we get the right nbytes.
*/
- if (inode_item) {
+ if (is_inode_item) {
struct btrfs_inode_item *item;
u64 nbytes;
u32 mode;
@@ -472,20 +565,20 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
item = btrfs_item_ptr(dst_eb, dst_slot,
struct btrfs_inode_item);
nbytes = btrfs_inode_nbytes(dst_eb, item);
- item = btrfs_item_ptr(eb, slot,
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot,
struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, nbytes);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, nbytes);
/*
* If this is a directory we need to reset the i_size to
* 0 so that we can set it up properly when replaying
* the rest of the items in this log.
*/
- mode = btrfs_inode_mode(eb, item);
+ mode = btrfs_inode_mode(wc->log_leaf, item);
if (S_ISDIR(mode))
- btrfs_set_inode_size(eb, item, 0);
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
- } else if (inode_item) {
+ } else if (is_inode_item) {
struct btrfs_inode_item *item;
u32 mode;
@@ -493,38 +586,41 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
* New inode, set nbytes to 0 so that the nbytes comes out
* properly when we replay the extents.
*/
- item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
- btrfs_set_inode_nbytes(eb, item, 0);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_inode_item);
+ btrfs_set_inode_nbytes(wc->log_leaf, item, 0);
/*
* If this is a directory we need to reset the i_size to 0 so
* that we can set it up properly when replaying the rest of
* the items in this log.
*/
- mode = btrfs_inode_mode(eb, item);
+ mode = btrfs_inode_mode(wc->log_leaf, item);
if (S_ISDIR(mode))
- btrfs_set_inode_size(eb, item, 0);
+ btrfs_set_inode_size(wc->log_leaf, item, 0);
}
insert:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* try to insert the key into the destination tree */
- path->skip_release_on_error = 1;
- ret = btrfs_insert_empty_item(trans, root, path,
- key, item_size);
- path->skip_release_on_error = 0;
+ wc->subvol_path->skip_release_on_error = 1;
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size);
+ wc->subvol_path->skip_release_on_error = 0;
- dst_eb = path->nodes[0];
- dst_slot = path->slots[0];
+ dst_eb = wc->subvol_path->nodes[0];
+ dst_slot = wc->subvol_path->slots[0];
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
if (found_size > item_size)
- btrfs_truncate_item(trans, path, item_size, 1);
+ btrfs_truncate_item(trans, wc->subvol_path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(trans, path, item_size - found_size);
+ btrfs_extend_item(trans, wc->subvol_path, item_size - found_size);
} else if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item for key (%llu %u %llu)",
+ wc->log_key.objectid, wc->log_key.type,
+ wc->log_key.offset);
return ret;
}
dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
@@ -538,15 +634,15 @@ insert:
* state of the tree found in the subvolume, and i_size is modified
* as it goes
*/
- if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+ if (is_inode_item && ret == -EEXIST) {
struct btrfs_inode_item *src_item;
struct btrfs_inode_item *dst_item;
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(eb, src_item) == 0) {
- const u64 ino_size = btrfs_inode_size(eb, src_item);
+ if (btrfs_inode_generation(wc->log_leaf, src_item) == 0) {
+ const u64 ino_size = btrfs_inode_size(wc->log_leaf, src_item);
/*
* For regular files an ino_size == 0 is used only when
@@ -555,21 +651,21 @@ insert:
* case don't set the size of the inode in the fs/subvol
* tree, otherwise we would be throwing valid data away.
*/
- if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ if (S_ISREG(btrfs_inode_mode(wc->log_leaf, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
ino_size != 0)
btrfs_set_inode_size(dst_eb, dst_item, ino_size);
goto no_copy;
}
- if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+ if (S_ISDIR(btrfs_inode_mode(wc->log_leaf, src_item)) &&
S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
save_old_i_size = 1;
saved_i_size = btrfs_inode_size(dst_eb, dst_item);
}
}
- copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size);
+ copy_extent_buffer(dst_eb, wc->log_leaf, dst_ptr, src_ptr, item_size);
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
@@ -579,7 +675,7 @@ insert:
}
/* make sure the generation is filled in */
- if (key->type == BTRFS_INODE_ITEM_KEY) {
+ if (is_inode_item) {
struct btrfs_inode_item *dst_item;
dst_item = (struct btrfs_inode_item *)dst_ptr;
@@ -587,7 +683,7 @@ insert:
btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
}
no_copy:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -618,292 +714,354 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
* The extent is inserted into the file, dropping any existing extents
* from the file that overlap the new one.
*/
-static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int replay_one_extent(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
- u64 start = key->offset;
+ const u64 start = wc->log_key.offset;
u64 nbytes = 0;
+ u64 csum_start;
+ u64 csum_end;
+ LIST_HEAD(ordered_sums);
+ u64 offset;
+ unsigned long dest_offset;
+ struct btrfs_key ins;
struct btrfs_file_extent_item *item;
struct btrfs_inode *inode = NULL;
- unsigned long size;
int ret = 0;
- item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(eb, item);
+ item = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(wc->log_leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- nbytes = btrfs_file_extent_num_bytes(eb, item);
- extent_end = start + nbytes;
-
- /*
- * We don't add to the inodes nbytes if we are prealloc or a
- * hole.
- */
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
- nbytes = 0;
+ extent_end = start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ /* Holes don't take up space. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) != 0)
+ nbytes = btrfs_file_extent_num_bytes(wc->log_leaf, item);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_ram_bytes(eb, item);
- nbytes = btrfs_file_extent_ram_bytes(eb, item);
- extent_end = ALIGN(start + size,
- fs_info->sectorsize);
+ nbytes = btrfs_file_extent_ram_bytes(wc->log_leaf, item);
+ extent_end = ALIGN(start + nbytes, fs_info->sectorsize);
} else {
- btrfs_err(fs_info,
- "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
- found_type, btrfs_root_id(root), key->objectid, key->offset);
+ btrfs_abort_log_replay(wc, -EUCLEAN,
+ "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
+ found_type, btrfs_root_id(root),
+ wc->log_key.objectid, wc->log_key.offset);
return -EUCLEAN;
}
- inode = btrfs_iget_logging(key->objectid, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get inode %llu for root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
+ }
/*
* first check to see if we already have this extent in the
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, wc->subvol_path,
+ btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
struct btrfs_file_extent_item existing;
unsigned long ptr;
- ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing));
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ read_extent_buffer(leaf, &existing, ptr, sizeof(existing));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
- if (memcmp_extent_buffer(eb, &existing, (unsigned long)item,
+ if (memcmp_extent_buffer(wc->log_leaf, &existing, (unsigned long)item,
sizeof(existing)) == 0) {
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
goto out;
}
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* drop any overlapping extents */
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
+ drop_args.path = wc->subvol_path;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu range [%llu, %llu) root %llu",
+ wc->log_key.objectid, start, extent_end,
+ btrfs_root_id(root));
goto out;
+ }
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- u64 offset;
- unsigned long dest_offset;
- struct btrfs_key ins;
-
- if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
- btrfs_fs_incompat(fs_info, NO_HOLES))
- goto update_inode;
-
- ret = btrfs_insert_empty_item(trans, root, path, key,
- sizeof(*item));
+ if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ /* inline extents are easy, we just overwrite them */
+ ret = overwrite_item(wc);
if (ret)
goto out;
- dest_offset = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
- copy_extent_buffer(path->nodes[0], eb, dest_offset,
- (unsigned long)item, sizeof(*item));
+ goto update_inode;
+ }
- ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
- ins.type = BTRFS_EXTENT_ITEM_KEY;
- ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
- offset = key->offset - btrfs_file_extent_offset(eb, item);
+ /*
+ * If not an inline extent, it can only be a regular or prealloc one.
+ * We have checked that above and returned -EUCLEAN if not.
+ */
- /*
- * Manually record dirty extent, as here we did a shallow
- * file extent item copy and skip normal backref update,
- * but modifying extent tree all by ourselves.
- * So need to manually record dirty extent for qgroup,
- * as the owner of the file extent changed from log tree
- * (doesn't affect qgroup) to fs/file tree(affects qgroup)
- */
- ret = btrfs_qgroup_trace_extent(trans,
- btrfs_file_extent_disk_bytenr(eb, item),
- btrfs_file_extent_disk_num_bytes(eb, item));
- if (ret < 0)
- goto out;
+ /* A hole and NO_HOLES feature enabled, nothing else to do. */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES))
+ goto update_inode;
- if (ins.objectid > 0) {
- u64 csum_start;
- u64 csum_end;
- LIST_HEAD(ordered_sums);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path,
+ &wc->log_key, sizeof(*item));
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert item with key (%llu %u %llu) root %llu",
+ wc->log_key.objectid, wc->log_key.type,
+ wc->log_key.offset, btrfs_root_id(root));
+ goto out;
+ }
+ dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0],
+ wc->subvol_path->slots[0]);
+ copy_extent_buffer(wc->subvol_path->nodes[0], wc->log_leaf, dest_offset,
+ (unsigned long)item, sizeof(*item));
- /*
- * is this extent already allocated in the extent
- * allocation tree? If so, just add a reference
- */
- ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
- ins.offset);
- if (ret < 0) {
- goto out;
- } else if (ret == 0) {
- struct btrfs_ref ref = {
- .action = BTRFS_ADD_DELAYED_REF,
- .bytenr = ins.objectid,
- .num_bytes = ins.offset,
- .owning_root = btrfs_root_id(root),
- .ref_root = btrfs_root_id(root),
- };
- btrfs_init_data_ref(&ref, key->objectid, offset,
- 0, false);
- ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret)
- goto out;
- } else {
- /*
- * insert the extent pointer in the extent
- * allocation tree
- */
- ret = btrfs_alloc_logged_file_extent(trans,
- btrfs_root_id(root),
- key->objectid, offset, &ins);
- if (ret)
- goto out;
- }
- btrfs_release_path(path);
+ /*
+ * We have an explicit hole and NO_HOLES is not enabled. We have added
+ * the hole file extent item to the subvolume tree, so we don't have
+ * anything else to do other than update the file extent item range and
+ * update the inode item.
+ */
+ if (btrfs_file_extent_disk_bytenr(wc->log_leaf, item) == 0) {
+ btrfs_release_path(wc->subvol_path);
+ goto update_inode;
+ }
- if (btrfs_file_extent_compression(eb, item)) {
- csum_start = ins.objectid;
- csum_end = csum_start + ins.offset;
- } else {
- csum_start = ins.objectid +
- btrfs_file_extent_offset(eb, item);
- csum_end = csum_start +
- btrfs_file_extent_num_bytes(eb, item);
- }
+ ins.objectid = btrfs_file_extent_disk_bytenr(wc->log_leaf, item);
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = btrfs_file_extent_disk_num_bytes(wc->log_leaf, item);
+ offset = wc->log_key.offset - btrfs_file_extent_offset(wc->log_leaf, item);
- ret = btrfs_lookup_csums_list(root->log_root,
- csum_start, csum_end - 1,
- &ordered_sums, false);
- if (ret < 0)
- goto out;
- ret = 0;
- /*
- * Now delete all existing cums in the csum root that
- * cover our range. We do this because we can have an
- * extent that is completely referenced by one file
- * extent item and partially referenced by another
- * file extent item (like after using the clone or
- * extent_same ioctls). In this case if we end up doing
- * the replay of the one that partially references the
- * extent first, and we do not do the csum deletion
- * below, we can get 2 csum items in the csum tree that
- * overlap each other. For example, imagine our log has
- * the two following file extent items:
- *
- * key (257 EXTENT_DATA 409600)
- * extent data disk byte 12845056 nr 102400
- * extent data offset 20480 nr 20480 ram 102400
- *
- * key (257 EXTENT_DATA 819200)
- * extent data disk byte 12845056 nr 102400
- * extent data offset 0 nr 102400 ram 102400
- *
- * Where the second one fully references the 100K extent
- * that starts at disk byte 12845056, and the log tree
- * has a single csum item that covers the entire range
- * of the extent:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
- *
- * After the first file extent item is replayed, the
- * csum tree gets the following csum item:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
- *
- * Which covers the 20K sub-range starting at offset 20K
- * of our extent. Now when we replay the second file
- * extent item, if we do not delete existing csum items
- * that cover any of its blocks, we end up getting two
- * csum items in our csum tree that overlap each other:
- *
- * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
- * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
- *
- * Which is a problem, because after this anyone trying
- * to lookup up for the checksum of any block of our
- * extent starting at an offset of 40K or higher, will
- * end up looking at the second csum item only, which
- * does not contain the checksum for any block starting
- * at offset 40K or higher of our extent.
- */
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums;
- struct btrfs_root *csum_root;
-
- sums = list_first_entry(&ordered_sums,
- struct btrfs_ordered_sum,
- list);
- csum_root = btrfs_csum_root(fs_info,
- sums->logical);
- if (!ret)
- ret = btrfs_del_csums(trans, csum_root,
- sums->logical,
- sums->len);
- if (!ret)
- ret = btrfs_csum_file_blocks(trans,
- csum_root,
- sums);
- list_del(&sums->list);
- kfree(sums);
- }
- if (ret)
- goto out;
- } else {
- btrfs_release_path(path);
+ /*
+ * Manually record dirty extent, as here we did a shallow file extent
+ * item copy and skip normal backref update, but modifying extent tree
+	 * all by ourselves. So we need to manually record the dirty extent for
+	 * qgroup,
+ * as the owner of the file extent changed from log tree (doesn't affect
+ * qgroup) to fs/file tree (affects qgroup).
+ */
+ ret = btrfs_qgroup_trace_extent(trans, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to trace extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ }
+
+ /*
+ * Is this extent already allocated in the extent tree?
+ * If so, just add a reference.
+ */
+ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid, btrfs_root_id(root));
+ goto out;
+ } else if (ret == 0) {
+ struct btrfs_ref ref = {
+ .action = BTRFS_ADD_DELAYED_REF,
+ .bytenr = ins.objectid,
+ .num_bytes = ins.offset,
+ .owning_root = btrfs_root_id(root),
+ .ref_root = btrfs_root_id(root),
+ };
+
+ btrfs_init_data_ref(&ref, wc->log_key.objectid, offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to increment data extent for bytenr %llu disk_num_bytes %llu inode %llu root %llu",
+ ins.objectid, ins.offset,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
}
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- /* inline extents are easy, we just overwrite them */
- ret = overwrite_item(trans, root, path, eb, slot, key);
- if (ret)
+ } else {
+ /* Insert the extent pointer in the extent tree. */
+ ret = btrfs_alloc_logged_file_extent(trans, btrfs_root_id(root),
+ wc->log_key.objectid, offset, &ins);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to allocate logged data extent for bytenr %llu disk_num_bytes %llu offset %llu inode %llu root %llu",
+ ins.objectid, ins.offset, offset,
+ wc->log_key.objectid, btrfs_root_id(root));
goto out;
+ }
}
- ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
+ btrfs_release_path(wc->subvol_path);
+
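+	/*
+	 * Compute the on-disk range whose checksums must be copied from the
+	 * log tree: for a compressed extent that is the whole on-disk extent,
+	 * for an uncompressed one only the sub-range referenced by this file
+	 * extent item.
+	 */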
+ if (btrfs_file_extent_compression(wc->log_leaf, item)) {
+ csum_start = ins.objectid;
+ csum_end = csum_start + ins.offset;
+ } else {
+ csum_start = ins.objectid + btrfs_file_extent_offset(wc->log_leaf, item);
+ csum_end = csum_start + btrfs_file_extent_num_bytes(wc->log_leaf, item);
+ }
+
+ ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1,
+ &ordered_sums, false);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+				"failed to lookup csums for range [%llu, %llu) inode %llu root %llu",
+ csum_start, csum_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+ ret = 0;
+ /*
+	 * Now delete all existing csums in the csum root that cover our range.
+ * We do this because we can have an extent that is completely
+ * referenced by one file extent item and partially referenced by
+ * another file extent item (like after using the clone or extent_same
+ * ioctls). In this case if we end up doing the replay of the one that
+ * partially references the extent first, and we do not do the csum
+ * deletion below, we can get 2 csum items in the csum tree that overlap
+ * each other. For example, imagine our log has the two following file
+ * extent items:
+ *
+ * key (257 EXTENT_DATA 409600)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 20480 nr 20480 ram 102400
+ *
+ * key (257 EXTENT_DATA 819200)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 0 nr 102400 ram 102400
+ *
+ * Where the second one fully references the 100K extent that starts at
+ * disk byte 12845056, and the log tree has a single csum item that
+ * covers the entire range of the extent:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ *
+ * After the first file extent item is replayed, the csum tree gets the
+ * following csum item:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which covers the 20K sub-range starting at offset 20K of our extent.
+ * Now when we replay the second file extent item, if we do not delete
+ * existing csum items that cover any of its blocks, we end up getting
+ * two csum items in our csum tree that overlap each other:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+	 * Which is a problem, because after this anyone trying to look up the
+	 * checksum of any block of our extent starting at an offset of 40K or
+	 * higher will end up looking at the second csum item only, which
+ * does not contain the checksum for any block starting at offset 40K or
+ * higher of our extent.
+ */
+ while (!list_empty(&ordered_sums)) {
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_root *csum_root;
+
+ sums = list_first_entry(&ordered_sums, struct btrfs_ordered_sum, list);
+ csum_root = btrfs_csum_root(fs_info, sums->logical);
+ if (!ret) {
+ ret = btrfs_del_csums(trans, csum_root, sums->logical,
+ sums->len);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ }
+ if (!ret) {
+ ret = btrfs_csum_file_blocks(trans, csum_root, sums);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to add csums for range [%llu, %llu) inode %llu root %llu",
+ sums->logical,
+ sums->logical + sums->len,
+ wc->log_key.objectid,
+ btrfs_root_id(root));
+ }
+ list_del(&sums->list);
+ kfree(sums);
+ }
if (ret)
goto out;
update_inode:
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to set file extent range [%llu, %llu) inode %llu root %llu",
+ start, extent_end, wc->log_key.objectid,
+ btrfs_root_id(root));
+ goto out;
+ }
+
btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
out:
iput(&inode->vfs_inode);
return ret;
}
-static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
+static int unlink_inode_for_log_replay(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
const struct fscrypt_str *name)
{
+ struct btrfs_trans_handle *trans = wc->trans;
int ret;
ret = btrfs_unlink_inode(trans, dir, inode, name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to unlink inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
return ret;
+ }
/*
* Whenever we need to check if a name exists or not, we check the
* fs/subvolume tree. So after an unlink we must run delayed items, so
* that future checks for a name during log replay see that the name
 	 * does not exist anymore.
*/
- return btrfs_run_delayed_items(trans);
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+"failed to run delayed items current inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir), name->len,
+ name->name, btrfs_root_id(inode->root));
+
+ return ret;
}
/*
@@ -914,39 +1072,44 @@ static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
* This is a helper function to do the unlink of a specific directory
* item
*/
-static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
+static noinline int drop_one_dir_item(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
struct btrfs_root *root = dir->root;
struct btrfs_inode *inode;
struct fscrypt_str name;
- struct extent_buffer *leaf;
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
struct btrfs_key location;
int ret;
- leaf = path->nodes[0];
-
btrfs_dir_item_key_to_cpu(leaf, di, &location);
ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
- if (ret)
- return -ENOMEM;
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
+ return ret;
+ }
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
inode = btrfs_iget_logging(location.objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to open inode %llu parent dir %llu name %.*s root %llu",
+ location.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
inode = NULL;
goto out;
}
- ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ ret = link_to_fixup_dir(wc, location.objectid);
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
out:
kfree(name.name);
if (inode)
@@ -1013,7 +1176,7 @@ static noinline int backref_in_log(struct btrfs_root *log,
u64 ref_objectid,
const struct fscrypt_str *name)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
path = btrfs_alloc_path();
@@ -1021,12 +1184,10 @@ static noinline int backref_in_log(struct btrfs_root *log,
return -ENOMEM;
ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret == 1) {
- ret = 0;
- goto out;
- }
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return 0;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
@@ -1035,20 +1196,15 @@ static noinline int backref_in_log(struct btrfs_root *log,
else
ret = !!btrfs_find_name_in_backref(path->nodes[0],
path->slots[0], name);
-out:
- btrfs_free_path(path);
return ret;
}
-static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_root *log_root,
+static int unlink_refs_not_in_log(struct walk_control *wc,
struct btrfs_key *search_key,
struct btrfs_inode *dir,
- struct btrfs_inode *inode,
- u64 parent_objectid)
+ struct btrfs_inode *inode)
{
- struct extent_buffer *leaf = path->nodes[0];
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
unsigned long ptr;
unsigned long ptr_end;
@@ -1057,8 +1213,8 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
* log. If so, we allow them to stay otherwise they must be unlinked as
* a conflict.
*/
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, wc->subvol_path->slots[0]);
while (ptr < ptr_end) {
struct fscrypt_str victim_name;
struct btrfs_inode_ref *victim_ref;
@@ -1068,22 +1224,34 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
ret = read_alloc_one_name(leaf, (victim_ref + 1),
btrfs_inode_ref_name_len(leaf, victim_ref),
&victim_name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
return ret;
+ }
- ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name);
+ ret = backref_in_log(wc->log, search_key, btrfs_ino(dir), &victim_name);
if (ret) {
- kfree(victim_name.name);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
+ kfree(victim_name.name);
return ret;
+ }
+ kfree(victim_name.name);
ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
continue;
}
inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
kfree(victim_name.name);
if (ret)
return ret;
@@ -1093,64 +1261,64 @@ static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
return 0;
}
-static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_root *root,
- struct btrfs_root *log_root,
+static int unlink_extrefs_not_in_log(struct walk_control *wc,
struct btrfs_key *search_key,
- struct btrfs_inode *inode,
- u64 inode_objectid,
- u64 parent_objectid)
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode)
{
- struct extent_buffer *leaf = path->nodes[0];
- const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]);
- const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
+ struct extent_buffer *leaf = wc->subvol_path->nodes[0];
+ const unsigned long base = btrfs_item_ptr_offset(leaf, wc->subvol_path->slots[0]);
+ const u32 item_size = btrfs_item_size(leaf, wc->subvol_path->slots[0]);
u32 cur_offset = 0;
while (cur_offset < item_size) {
+ struct btrfs_root *log_root = wc->log;
struct btrfs_inode_extref *extref;
- struct btrfs_inode *victim_parent;
struct fscrypt_str victim_name;
int ret;
extref = (struct btrfs_inode_extref *)(base + cur_offset);
victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
- if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+ if (btrfs_inode_extref_parent(leaf, extref) != btrfs_ino(dir))
goto next;
ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
&victim_name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for inode %llu parent dir %llu root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ btrfs_root_id(inode->root));
return ret;
+ }
- search_key->objectid = inode_objectid;
+ search_key->objectid = btrfs_ino(inode);
search_key->type = BTRFS_INODE_EXTREF_KEY;
- search_key->offset = btrfs_extref_hash(parent_objectid,
+ search_key->offset = btrfs_extref_hash(btrfs_ino(dir),
victim_name.name,
victim_name.len);
- ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name);
+ ret = backref_in_log(log_root, search_key, btrfs_ino(dir), &victim_name);
if (ret) {
- kfree(victim_name.name);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if backref is in log tree for inode %llu parent dir %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ victim_name.len, victim_name.name,
+ btrfs_root_id(inode->root));
+ kfree(victim_name.name);
return ret;
+ }
+ kfree(victim_name.name);
next:
cur_offset += victim_name.len + sizeof(*extref);
continue;
}
- victim_parent = btrfs_iget_logging(parent_objectid, root);
- if (IS_ERR(victim_parent)) {
- kfree(victim_name.name);
- return PTR_ERR(victim_parent);
- }
-
inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- ret = unlink_inode_for_log_replay(trans, victim_parent, inode,
- &victim_name);
- iput(&victim_parent->vfs_inode);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &victim_name);
kfree(victim_name.name);
if (ret)
return ret;
@@ -1160,27 +1328,29 @@ next:
return 0;
}
-static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_root *log_root,
+static inline int __add_inode_ref(struct walk_control *wc,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- u64 inode_objectid, u64 parent_objectid,
u64 ref_index, struct fscrypt_str *name)
{
int ret;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_dir_item *di;
struct btrfs_key search_key;
struct btrfs_inode_extref *extref;
again:
/* Search old style refs */
- search_key.objectid = inode_objectid;
+ search_key.objectid = btrfs_ino(inode);
search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = parent_objectid;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ search_key.offset = btrfs_ino(dir);
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key (%llu %u %llu) root %llu",
+ search_key.objectid, search_key.type,
+ search_key.offset, btrfs_root_id(root));
return ret;
} else if (ret == 0) {
/*
@@ -1190,52 +1360,60 @@ again:
if (search_key.objectid == search_key.offset)
return 1;
- ret = unlink_refs_not_in_log(trans, path, log_root, &search_key,
- dir, inode, parent_objectid);
+ ret = unlink_refs_not_in_log(wc, &search_key, dir, inode);
if (ret == -EAGAIN)
goto again;
else if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* Same search but for extended refs */
- extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid);
+ extref = btrfs_lookup_inode_extref(root, wc->subvol_path, name,
+ btrfs_ino(inode), btrfs_ino(dir));
if (IS_ERR(extref)) {
return PTR_ERR(extref);
} else if (extref) {
- ret = unlink_extrefs_not_in_log(trans, path, root, log_root,
- &search_key, inode,
- inode_objectid, parent_objectid);
+ ret = unlink_extrefs_not_in_log(wc, &search_key, dir, inode);
if (ret == -EAGAIN)
goto again;
else if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting sequence number */
- di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+ di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path, btrfs_ino(dir),
ref_index, name, 0);
if (IS_ERR(di)) {
- return PTR_ERR(di);
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+"failed to lookup dir index item for dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(dir), ref_index, name->len,
+ name->name, btrfs_root_id(root));
+ return ret;
} else if (di) {
- ret = drop_one_dir_item(trans, path, dir, di);
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
/* look for a conflicting name */
- di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
+ di = btrfs_lookup_dir_item(trans, root, wc->subvol_path, btrfs_ino(dir), name, 0);
if (IS_ERR(di)) {
- return PTR_ERR(di);
+ ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name->len, name->name,
+ btrfs_root_id(root));
+ return ret;
} else if (di) {
- ret = drop_one_dir_item(trans, path, dir, di);
+ ret = drop_one_dir_item(wc, dir, di);
if (ret)
return ret;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return 0;
}
@@ -1288,63 +1466,79 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
* proper unlink of that name (that is, remove its entry from the inode
* reference item and both dir index keys).
*/
-static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_inode *inode,
- struct extent_buffer *log_eb,
- int log_slot,
- struct btrfs_key *key)
+static int unlink_old_inode_refs(struct walk_control *wc, struct btrfs_inode *inode)
{
+ struct btrfs_root *root = wc->root;
int ret;
unsigned long ref_ptr;
unsigned long ref_end;
struct extent_buffer *eb;
again:
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ btrfs_release_path(wc->subvol_path);
+ ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0);
if (ret > 0) {
ret = 0;
goto out;
}
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search subvolume tree for key (%llu %u %llu) root %llu",
+ wc->log_key.objectid, wc->log_key.type,
+ wc->log_key.offset, btrfs_root_id(root));
goto out;
+ }
- eb = path->nodes[0];
- ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
+ eb = wc->subvol_path->nodes[0];
+ ref_ptr = btrfs_item_ptr_offset(eb, wc->subvol_path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size(eb, wc->subvol_path->slots[0]);
while (ref_ptr < ref_end) {
struct fscrypt_str name;
u64 parent_id;
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
ret = extref_get_fields(eb, ref_ptr, &name,
NULL, &parent_id);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
+ goto out;
+ }
} else {
- parent_id = key->offset;
+ parent_id = wc->log_key.offset;
ret = ref_get_fields(eb, ref_ptr, &name, NULL);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_id %llu root %llu",
+ btrfs_ino(inode), parent_id,
+ btrfs_root_id(root));
+ goto out;
+ }
}
- if (ret)
- goto out;
- if (key->type == BTRFS_INODE_EXTREF_KEY)
- ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(wc->log_leaf, wc->log_slot,
parent_id, &name);
else
- ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
+ ret = !!btrfs_find_name_in_backref(wc->log_leaf, wc->log_slot,
+ &name);
if (!ret) {
struct btrfs_inode *dir;
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
dir = btrfs_iget_logging(parent_id, root);
if (IS_ERR(dir)) {
ret = PTR_ERR(dir);
kfree(name.name);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_id, btrfs_root_id(root));
goto out;
}
- ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
kfree(name.name);
iput(&dir->vfs_inode);
if (ret)
@@ -1354,56 +1548,51 @@ again:
kfree(name.name);
ref_ptr += name.len;
- if (key->type == BTRFS_INODE_EXTREF_KEY)
+ if (wc->log_key.type == BTRFS_INODE_EXTREF_KEY)
ref_ptr += sizeof(struct btrfs_inode_extref);
else
ref_ptr += sizeof(struct btrfs_inode_ref);
}
ret = 0;
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function. (it should be released on return).
+ * Replay one inode back reference item found in the log tree.
+ * Path is for temporary use by this function (it should be released on return).
*/
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int add_inode_ref(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_inode *dir = NULL;
struct btrfs_inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
struct fscrypt_str name = { 0 };
int ret;
- const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY);
+ const bool is_extref_item = (wc->log_key.type == BTRFS_INODE_EXTREF_KEY);
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0;
int ref_struct_size;
- ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size(eb, slot);
+ ref_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot);
+ ref_end = ref_ptr + btrfs_item_size(wc->log_leaf, wc->log_slot);
if (is_extref_item) {
struct btrfs_inode_extref *r;
ref_struct_size = sizeof(struct btrfs_inode_extref);
r = (struct btrfs_inode_extref *)ref_ptr;
- parent_objectid = btrfs_inode_extref_parent(eb, r);
+ parent_objectid = btrfs_inode_extref_parent(wc->log_leaf, r);
} else {
ref_struct_size = sizeof(struct btrfs_inode_ref);
- parent_objectid = key->offset;
+ parent_objectid = wc->log_key.offset;
}
- inode_objectid = key->objectid;
+ inode_objectid = wc->log_key.objectid;
/*
* it is possible that we didn't log all the parent directories
@@ -1416,6 +1605,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = PTR_ERR(dir);
if (ret == -ENOENT)
ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid, btrfs_root_id(root));
dir = NULL;
goto out;
}
@@ -1423,16 +1616,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
inode = btrfs_iget_logging(inode_objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ inode_objectid, btrfs_root_id(root));
inode = NULL;
goto out;
}
while (ref_ptr < ref_end) {
if (is_extref_item) {
- ret = extref_get_fields(eb, ref_ptr, &name,
+ ret = extref_get_fields(wc->log_leaf, ref_ptr, &name,
&ref_index, &parent_objectid);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get extref details for inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
goto out;
+ }
/*
* parent object can change from one array
* item to another.
@@ -1457,19 +1658,35 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
*/
ret = 0;
goto next;
+ } else {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ parent_objectid,
+ btrfs_root_id(root));
}
goto out;
}
}
} else {
- ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
- if (ret)
+ ret = ref_get_fields(wc->log_leaf, ref_ptr, &name, &ref_index);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get ref details for inode %llu parent_objectid %llu root %llu",
+ btrfs_ino(inode),
+ parent_objectid,
+ btrfs_root_id(root));
goto out;
+ }
}
- ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
- ref_index, &name);
+ ret = inode_in_dir(root, wc->subvol_path, btrfs_ino(dir),
+ btrfs_ino(inode), ref_index, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if inode %llu is in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode), btrfs_ino(dir),
+ ref_index, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (ret == 0) {
/*
@@ -1479,9 +1696,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
- ret = __add_inode_ref(trans, root, path, log, dir, inode,
- inode_objectid, parent_objectid,
- ref_index, &name);
+ ret = __add_inode_ref(wc, dir, inode, ref_index, &name);
if (ret) {
if (ret == 1)
ret = 0;
@@ -1490,12 +1705,24 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
/* insert our name */
ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu",
+ btrfs_ino(inode),
+ btrfs_ino(dir), ref_index,
+ name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
ret = btrfs_update_inode(trans, inode);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
goto out;
+ }
}
/* Else, ret == 1, we already have a perfect match, we're done. */
@@ -1517,14 +1744,14 @@ next:
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
- ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
+ ret = unlink_old_inode_refs(wc, inode);
if (ret)
goto out;
/* finally write the back reference in the inode */
- ret = overwrite_item(trans, root, path, eb, slot, key);
+ ret = overwrite_item(wc);
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
kfree(name.name);
if (dir)
iput(&dir->vfs_inode);
@@ -1642,26 +1869,22 @@ process_slot:
* number of back refs found. If it goes down to zero, the iput
* will free the inode.
*/
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+static noinline int fixup_inode_link_count(struct walk_control *wc,
struct btrfs_inode *inode)
{
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_root *root = inode->root;
- struct btrfs_path *path;
int ret;
u64 nlink = 0;
const u64 ino = btrfs_ino(inode);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- ret = count_inode_refs(inode, path);
+ ret = count_inode_refs(inode, wc->subvol_path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(inode, path);
+ ret = count_inode_extrefs(inode, wc->subvol_path);
if (ret < 0)
goto out;
@@ -1680,7 +1903,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (inode->vfs_inode.i_nlink == 0) {
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = replay_dir_deletes(trans, root, NULL, path, ino, true);
+ ret = replay_dir_deletes(wc, ino, true);
if (ret)
goto out;
}
@@ -1690,13 +1913,11 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
}
out:
- btrfs_free_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
-static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path)
+static noinline int fixup_inode_link_counts(struct walk_control *wc)
{
int ret;
struct btrfs_key key;
@@ -1705,48 +1926,50 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_inode *inode;
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ ret = btrfs_search_slot(trans, root, &key, wc->subvol_path, -1, 1);
if (ret < 0)
break;
if (ret == 1) {
ret = 0;
- if (path->slots[0] == 0)
+ if (wc->subvol_path->slots[0] == 0)
break;
- path->slots[0]--;
+ wc->subvol_path->slots[0]--;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, wc->subvol_path->slots[0]);
if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- ret = btrfs_del_item(trans, root, path);
+ ret = btrfs_del_item(trans, root, wc->subvol_path);
if (ret)
break;
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
inode = btrfs_iget_logging(key.offset, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
break;
}
- ret = fixup_inode_link_count(trans, inode);
+ ret = fixup_inode_link_count(wc, inode);
iput(&inode->vfs_inode);
if (ret)
break;
/*
* fixup on a directory may create new entries,
- * make sure we always look for the highset possible
+ * make sure we always look for the highest possible
* offset
*/
key.offset = (u64)-1;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
@@ -1756,36 +1979,48 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
* count when replay is done. The link count is incremented here
* so the inode won't go away until we check it
*/
-static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 objectid)
+static noinline int link_to_fixup_dir(struct walk_control *wc, u64 objectid)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct btrfs_key key;
int ret = 0;
struct btrfs_inode *inode;
struct inode *vfs_inode;
inode = btrfs_iget_logging(objectid, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ objectid, btrfs_root_id(root));
+ return ret;
+ }
vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
- ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &key, 0);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (ret == 0) {
if (!vfs_inode->i_nlink)
set_nlink(vfs_inode, 1);
else
inc_nlink(vfs_inode);
ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ objectid, btrfs_root_id(root));
} else if (ret == -EEXIST) {
ret = 0;
+ } else {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert fixup item for inode %llu root %llu",
+ objectid, btrfs_root_id(root));
}
iput(vfs_inode);
@@ -1826,9 +2061,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
return ret;
}
-static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
+static int delete_conflicting_dir_entry(struct walk_control *wc,
struct btrfs_inode *dir,
- struct btrfs_path *path,
struct btrfs_dir_item *dst_di,
const struct btrfs_key *log_key,
u8 log_flags,
@@ -1836,12 +2070,12 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
{
struct btrfs_key found_key;
- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ btrfs_dir_item_key_to_cpu(wc->subvol_path->nodes[0], dst_di, &found_key);
/* The existing dentry points to the same inode, don't delete it. */
if (found_key.objectid == log_key->objectid &&
found_key.type == log_key->type &&
found_key.offset == log_key->offset &&
- btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
+ btrfs_dir_flags(wc->subvol_path->nodes[0], dst_di) == log_flags)
return 1;
/*
@@ -1851,7 +2085,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
if (!exists)
return 0;
- return drop_one_dir_item(trans, path, dir, dst_di);
+ return drop_one_dir_item(wc, dir, dst_di);
}
/*
@@ -1870,13 +2104,10 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
* Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
* non-existing inode) and 1 if the name was replayed.
*/
-static noinline int replay_one_name(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb,
- struct btrfs_dir_item *di,
- struct btrfs_key *key)
+static noinline int replay_one_name(struct walk_control *wc, struct btrfs_dir_item *di)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
struct fscrypt_str name = { 0 };
struct btrfs_dir_item *dir_dst_di;
struct btrfs_dir_item *index_dst_di;
@@ -1891,53 +2122,85 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool update_size = true;
bool name_added = false;
- dir = btrfs_iget_logging(key->objectid, root);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
+ dir = btrfs_iget_logging(wc->log_key.objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ wc->log_key.objectid, btrfs_root_id(root));
+ return ret;
+ }
- ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
- if (ret)
+ ret = read_alloc_one_name(wc->log_leaf, di + 1,
+ btrfs_dir_name_len(wc->log_leaf, di), &name);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
+ }
- log_flags = btrfs_dir_flags(eb, di);
- btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- btrfs_release_path(path);
- if (ret < 0)
+ log_flags = btrfs_dir_flags(wc->log_leaf, di);
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &log_key);
+ ret = btrfs_lookup_inode(trans, root, wc->subvol_path, &log_key, 0);
+ btrfs_release_path(wc->subvol_path);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ log_key.objectid, btrfs_root_id(root));
goto out;
+ }
exists = (ret == 0);
ret = 0;
- dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- &name, 1);
+ dir_dst_di = btrfs_lookup_dir_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid, &name, 1);
if (IS_ERR(dir_dst_di)) {
ret = PTR_ERR(dir_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (dir_dst_di) {
- ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
+ ret = delete_conflicting_dir_entry(wc, dir, dir_dst_di,
&log_key, log_flags, exists);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
dir_dst_matches = (ret == 1);
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
- index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
- key->objectid, key->offset,
- &name, 1);
+ index_dst_di = btrfs_lookup_dir_index_item(trans, root, wc->subvol_path,
+ wc->log_key.objectid,
+ wc->log_key.offset, &name, 1);
if (IS_ERR(index_dst_di)) {
ret = PTR_ERR(index_dst_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu name %.*s root %llu",
+ wc->log_key.objectid, name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (index_dst_di) {
- ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
+ ret = delete_conflicting_dir_entry(wc, dir, index_dst_di,
&log_key, log_flags, exists);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete conflicting entry for dir %llu name %.*s root %llu",
+ btrfs_ino(dir), name.len, name.name,
+ btrfs_root_id(root));
goto out;
+ }
index_dst_matches = (ret == 1);
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (dir_dst_matches && index_dst_matches) {
ret = 0;
@@ -1951,9 +2214,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
*/
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = key->objectid;
+ search_key.offset = wc->log_key.objectid;
ret = backref_in_log(root->log_root, &search_key, 0, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if ref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
goto out;
} else if (ret) {
/* The dentry will be added later. */
@@ -1964,9 +2231,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
+ search_key.offset = btrfs_extref_hash(wc->log_key.objectid, name.name, name.len);
+ ret = backref_in_log(root->log_root, &search_key, wc->log_key.objectid, &name);
if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+"failed to check if extref item is logged for inode %llu dir %llu name %.*s root %llu",
+ search_key.objectid, btrfs_ino(dir),
+ name.len, name.name, btrfs_root_id(root));
goto out;
} else if (ret) {
/* The dentry will be added later. */
@@ -1974,11 +2245,15 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
update_size = false;
goto out;
}
- btrfs_release_path(path);
- ret = insert_one_name(trans, root, key->objectid, key->offset,
+ ret = insert_one_name(trans, root, wc->log_key.objectid, wc->log_key.offset,
&name, &log_key);
- if (ret && ret != -ENOENT && ret != -EEXIST)
+ if (ret && ret != -ENOENT && ret != -EEXIST) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to insert name %.*s for inode %llu dir %llu root %llu",
+ name.len, name.name, log_key.objectid,
+ btrfs_ino(dir), btrfs_root_id(root));
goto out;
+ }
if (!ret)
name_added = true;
update_size = false;
@@ -1988,6 +2263,10 @@ out:
if (!ret && update_size) {
btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
ret = btrfs_update_inode(trans, dir);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update dir inode %llu root %llu",
+ btrfs_ino(dir), btrfs_root_id(root));
}
kfree(name.name);
iput(&dir->vfs_inode);
@@ -1997,20 +2276,16 @@ out:
}
/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
-static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static noinline int replay_one_dir_item(struct walk_control *wc)
{
int ret;
struct btrfs_dir_item *di;
/* We only log dir index keys, which only contain a single dir item. */
- ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
+ ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY);
- di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
- ret = replay_one_name(trans, root, path, eb, di, key);
+ di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item);
+ ret = replay_one_name(wc, di);
if (ret < 0)
return ret;
@@ -2040,17 +2315,11 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
* to ever delete the parent directory has it would result in stale
* dentries that can never be deleted.
*/
- if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
- struct btrfs_path *fixup_path;
+ if (ret == 1 && btrfs_dir_ftype(wc->log_leaf, di) != BTRFS_FT_DIR) {
struct btrfs_key di_key;
- fixup_path = btrfs_alloc_path();
- if (!fixup_path)
- return -ENOMEM;
-
- btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
- btrfs_free_path(fixup_path);
+ btrfs_dir_item_key_to_cpu(wc->log_leaf, di, &di_key);
+ ret = link_to_fixup_dir(wc, di_key.objectid);
}
return ret;
@@ -2143,13 +2412,13 @@ out:
* item is not in the log, the item is removed and the inode it points
* to is unlinked
*/
-static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *log,
- struct btrfs_path *path,
+static noinline int check_item_in_log(struct walk_control *wc,
struct btrfs_path *log_path,
struct btrfs_inode *dir,
- struct btrfs_key *dir_key)
+ struct btrfs_key *dir_key,
+ bool force_remove)
{
+ struct btrfs_trans_handle *trans = wc->trans;
struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
@@ -2167,21 +2436,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
*/
ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
- eb = path->nodes[0];
- slot = path->slots[0];
+ eb = wc->subvol_path->nodes[0];
+ slot = wc->subvol_path->slots[0];
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
- if (ret)
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate name for dir %llu index %llu root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ btrfs_root_id(root));
goto out;
+ }
- if (log) {
+ if (!force_remove) {
struct btrfs_dir_item *log_di;
- log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
+ log_di = btrfs_lookup_dir_index_item(trans, wc->log, log_path,
dir_key->objectid,
dir_key->offset, &name, 0);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir index item for dir %llu index %llu name %.*s root %llu",
+ btrfs_ino(dir), dir_key->offset,
+ name.len, name.name,
+ btrfs_root_id(root));
goto out;
} else if (log_di) {
/* The dentry exists in the log, we have nothing to do. */
@@ -2191,28 +2470,31 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
}
btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_release_path(log_path);
inode = btrfs_iget_logging(location.objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
inode = NULL;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ location.objectid, btrfs_root_id(root));
goto out;
}
- ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ ret = link_to_fixup_dir(wc, location.objectid);
if (ret)
goto out;
inc_nlink(&inode->vfs_inode);
- ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
+ ret = unlink_inode_for_log_replay(wc, dir, inode, &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
* (an index number), so we're done.
*/
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_release_path(log_path);
kfree(name.name);
if (inode)
@@ -2220,59 +2502,67 @@ out:
return ret;
}
-static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
- const u64 ino)
+static int replay_xattr_deletes(struct walk_control *wc)
{
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = wc->log;
struct btrfs_key search_key;
- struct btrfs_path *log_path;
- int i;
+ BTRFS_PATH_AUTO_FREE(log_path);
+ const u64 ino = wc->log_key.objectid;
int nritems;
int ret;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
search_key.objectid = ino;
search_key.type = BTRFS_XATTR_ITEM_KEY;
search_key.offset = 0;
again:
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search xattrs for inode %llu root %llu",
+ ino, btrfs_root_id(root));
goto out;
+ }
process_leaf:
- nritems = btrfs_header_nritems(path->nodes[0]);
- for (i = path->slots[0]; i < nritems; i++) {
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ for (int i = wc->subvol_path->slots[0]; i < nritems; i++) {
struct btrfs_key key;
struct btrfs_dir_item *di;
struct btrfs_dir_item *log_di;
u32 total_size;
u32 cur;
- btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &key, i);
if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
ret = 0;
goto out;
}
- di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
- total_size = btrfs_item_size(path->nodes[0], i);
+ di = btrfs_item_ptr(wc->subvol_path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size(wc->subvol_path->nodes[0], i);
cur = 0;
while (cur < total_size) {
- u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
- u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+ u16 name_len = btrfs_dir_name_len(wc->subvol_path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(wc->subvol_path->nodes[0], di);
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
+ btrfs_abort_log_replay(wc, ret,
+ "failed to allocate memory for name of length %u",
+ name_len);
goto out;
}
- read_extent_buffer(path->nodes[0], name,
+ read_extent_buffer(wc->subvol_path->nodes[0], name,
(unsigned long)(di + 1), name_len);
log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
@@ -2280,40 +2570,59 @@ process_leaf:
btrfs_release_path(log_path);
if (!log_di) {
/* Doesn't exist in log tree, so delete it. */
- btrfs_release_path(path);
- di = btrfs_lookup_xattr(trans, root, path, ino,
+ btrfs_release_path(wc->subvol_path);
+ di = btrfs_lookup_xattr(trans, root, wc->subvol_path, ino,
name, name_len, -1);
- kfree(name);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
}
ASSERT(di);
ret = btrfs_delete_one_dir_name(trans, root,
- path, di);
- if (ret)
+ wc->subvol_path, di);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to delete xattr with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(wc->subvol_path);
+ kfree(name);
search_key = key;
goto again;
}
- kfree(name);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup xattr in log tree with name %.*s for inode %llu root %llu",
+ name_len, name, ino,
+ btrfs_root_id(root));
+ kfree(name);
goto out;
}
+ kfree(name);
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
}
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(root, wc->subvol_path);
if (ret > 0)
ret = 0;
else if (ret == 0)
goto process_leaf;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
out:
- btrfs_free_path(log_path);
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
return ret;
}
@@ -2328,12 +2637,11 @@ out:
* Anything we don't find in the log is unlinked and removed from the
* directory.
*/
-static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *log,
- struct btrfs_path *path,
+static noinline int replay_dir_deletes(struct walk_control *wc,
u64 dirid, bool del_all)
{
+ struct btrfs_root *root = wc->root;
+ struct btrfs_root *log = (del_all ? NULL : wc->log);
u64 range_start;
u64 range_end;
int ret = 0;
@@ -2345,8 +2653,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_INDEX_KEY;
log_path = btrfs_alloc_path();
- if (!log_path)
+ if (!log_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
dir = btrfs_iget_logging(dirid, root);
/*
@@ -2358,6 +2668,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
ret = PTR_ERR(dir);
if (ret == -ENOENT)
ret = 0;
+ else
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup dir inode %llu root %llu",
+ dirid, btrfs_root_id(root));
return ret;
}
@@ -2367,32 +2681,46 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (del_all)
range_end = (u64)-1;
else {
- ret = find_dir_range(log, path, dirid,
+ ret = find_dir_range(log, wc->subvol_path, dirid,
&range_start, &range_end);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to find range for dir %llu in log tree root %llu",
+ dirid, btrfs_root_id(root));
goto out;
- else if (ret > 0)
+ } else if (ret > 0) {
break;
+ }
}
dir_key.offset = range_start;
while (1) {
int nritems;
- ret = btrfs_search_slot(NULL, root, &dir_key, path,
- 0, 0);
- if (ret < 0)
+ ret = btrfs_search_slot(NULL, root, &dir_key,
+ wc->subvol_path, 0, 0);
+ if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to search root %llu for key (%llu %u %llu)",
+ btrfs_root_id(root),
+ dir_key.objectid, dir_key.type,
+ dir_key.offset);
goto out;
+ }
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 1)
+ nritems = btrfs_header_nritems(wc->subvol_path->nodes[0]);
+ if (wc->subvol_path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, wc->subvol_path);
+ if (ret == 1) {
break;
- else if (ret < 0)
+ } else if (ret < 0) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to get next leaf in subvolume root %llu",
+ btrfs_root_id(root));
goto out;
+ }
}
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
+ btrfs_item_key_to_cpu(wc->subvol_path->nodes[0], &found_key,
+ wc->subvol_path->slots[0]);
if (found_key.objectid != dirid ||
found_key.type != dir_key.type) {
ret = 0;
@@ -2402,23 +2730,21 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, log, path,
- log_path, dir,
- &found_key);
+ ret = check_item_in_log(wc, log_path, dir, &found_key, del_all);
if (ret)
goto out;
if (found_key.offset == (u64)-1)
break;
dir_key.offset = found_key.offset + 1;
}
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
if (range_end == (u64)-1)
break;
range_start = range_end + 1;
}
ret = 0;
out:
- btrfs_release_path(path);
+ btrfs_release_path(wc->subvol_path);
btrfs_free_path(log_path);
iput(&dir->vfs_inode);
return ret;
@@ -2435,7 +2761,7 @@ out:
* only in the log (references come from either directory items or inode
* back refs).
*/
-static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+static int replay_one_buffer(struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
int nritems;
@@ -2443,33 +2769,44 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
.transid = gen,
.level = level
};
- struct btrfs_path *path;
- struct btrfs_root *root = wc->replay_dest;
- struct btrfs_key key;
- int i;
+ struct btrfs_root *root = wc->root;
+ struct btrfs_trans_handle *trans = wc->trans;
int ret;
- ret = btrfs_read_extent_buffer(eb, &check);
- if (ret)
- return ret;
-
- level = btrfs_header_level(eb);
-
if (level != 0)
return 0;
- path = btrfs_alloc_path();
- if (!path)
+ /*

+ * Set to NULL since the leaf was not read yet, so that if we abort log
+ * replay on error there is no valid log tree leaf to dump.
+ */
+ wc->log_leaf = NULL;
+ ret = btrfs_read_extent_buffer(eb, &check);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to read log tree leaf %llu for root %llu",
+ eb->start, btrfs_root_id(root));
+ return ret;
+ }
+
+ ASSERT(wc->subvol_path == NULL);
+ wc->subvol_path = btrfs_alloc_path();
+ if (!wc->subvol_path) {
+ btrfs_abort_log_replay(wc, -ENOMEM, "failed to allocate path");
return -ENOMEM;
+ }
+
+ wc->log_leaf = eb;
nritems = btrfs_header_nritems(eb);
- for (i = 0; i < nritems; i++) {
+ for (wc->log_slot = 0; wc->log_slot < nritems; wc->log_slot++) {
struct btrfs_inode_item *inode_item;
- btrfs_item_key_to_cpu(eb, &key, i);
+ btrfs_item_key_to_cpu(eb, &wc->log_key, wc->log_slot);
- if (key.type == BTRFS_INODE_ITEM_KEY) {
- inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item);
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(eb, wc->log_slot,
+ struct btrfs_inode_item);
/*
* An inode with no links is either:
*
@@ -2498,22 +2835,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
}
/* Inode keys are done during the first stage. */
- if (key.type == BTRFS_INODE_ITEM_KEY &&
+ if (wc->log_key.type == BTRFS_INODE_ITEM_KEY &&
wc->stage == LOG_WALK_REPLAY_INODES) {
u32 mode;
- ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid);
+ ret = replay_xattr_deletes(wc);
if (ret)
break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
- ret = replay_dir_deletes(wc->trans, root, log, path,
- key.objectid, false);
+ ret = replay_dir_deletes(wc, wc->log_key.objectid, false);
if (ret)
break;
}
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ ret = overwrite_item(wc);
if (ret)
break;
@@ -2530,9 +2865,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct btrfs_inode *inode;
u64 from;
- inode = btrfs_iget_logging(key.objectid, root);
+ inode = btrfs_iget_logging(wc->log_key.objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
+ btrfs_abort_log_replay(wc, ret,
+ "failed to lookup inode %llu root %llu",
+ wc->log_key.objectid,
+ btrfs_root_id(root));
break;
}
from = ALIGN(i_size_read(&inode->vfs_inode),
@@ -2540,21 +2879,31 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(wc->trans, root, inode,
- &drop_args);
- if (!ret) {
+ drop_args.path = wc->subvol_path;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_log_replay(wc, ret,
+ "failed to drop extents for inode %llu root %llu offset %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root),
+ from);
+ } else {
inode_sub_bytes(&inode->vfs_inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
- ret = btrfs_update_inode(wc->trans, inode);
+ ret = btrfs_update_inode(trans, inode);
+ if (ret)
+ btrfs_abort_log_replay(wc, ret,
+ "failed to update inode %llu root %llu",
+ btrfs_ino(inode),
+ btrfs_root_id(root));
}
iput(&inode->vfs_inode);
if (ret)
break;
}
- ret = link_to_fixup_dir(wc->trans, root,
- path, key.objectid);
+ ret = link_to_fixup_dir(wc, wc->log_key.objectid);
if (ret)
break;
}
@@ -2562,10 +2911,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
if (wc->ignore_cur_inode)
continue;
- if (key.type == BTRFS_DIR_INDEX_KEY &&
+ if (wc->log_key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
- ret = replay_one_dir_item(wc->trans, root, path,
- eb, i, &key);
+ ret = replay_one_dir_item(wc);
if (ret)
break;
}
@@ -2574,20 +2922,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
continue;
/* these keys are simply copied */
- if (key.type == BTRFS_XATTR_ITEM_KEY) {
- ret = overwrite_item(wc->trans, root, path,
- eb, i, &key);
+ if (wc->log_key.type == BTRFS_XATTR_ITEM_KEY) {
+ ret = overwrite_item(wc);
if (ret)
break;
- } else if (key.type == BTRFS_INODE_REF_KEY ||
- key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = add_inode_ref(wc->trans, root, log, path,
- eb, i, &key);
+ } else if (wc->log_key.type == BTRFS_INODE_REF_KEY ||
+ wc->log_key.type == BTRFS_INODE_EXTREF_KEY) {
+ ret = add_inode_ref(wc);
if (ret)
break;
- } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
- ret = replay_one_extent(wc->trans, root, path,
- eb, i, &key);
+ } else if (wc->log_key.type == BTRFS_EXTENT_DATA_KEY) {
+ ret = replay_one_extent(wc);
if (ret)
break;
}
@@ -2598,55 +2943,55 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
* older kernel with such keys, ignore them.
*/
}
- btrfs_free_path(path);
+ btrfs_free_path(wc->subvol_path);
+ wc->subvol_path = NULL;
return ret;
}
-/*
- * Correctly adjust the reserved bytes occupied by a log tree extent buffer
- */
-static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
-{
- struct btrfs_block_group *cache;
-
- cache = btrfs_lookup_block_group(fs_info, start);
- if (!cache) {
- btrfs_err(fs_info, "unable to find block group for %llu", start);
- return -ENOENT;
- }
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->reserved -= fs_info->nodesize;
- cache->space_info->bytes_reserved -= fs_info->nodesize;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- btrfs_put_block_group(cache);
-
- return 0;
-}
-
static int clean_log_buffer(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_block_group *bg;
+
btrfs_tree_lock(eb);
btrfs_clear_buffer_dirty(trans, eb);
wait_on_extent_buffer_writeback(eb);
btrfs_tree_unlock(eb);
- if (trans)
- return btrfs_pin_reserved_extent(trans, eb);
+ if (trans) {
+ int ret;
- return unaccount_log_buffer(eb->fs_info, eb->start);
+ ret = btrfs_pin_reserved_extent(trans, eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ bg = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!bg) {
+ btrfs_err(fs_info, "unable to find block group for %llu", eb->start);
+ btrfs_handle_fs_error(fs_info, -ENOENT, NULL);
+ return -ENOENT;
+ }
+
+ spin_lock(&bg->space_info->lock);
+ spin_lock(&bg->lock);
+ bg->reserved -= fs_info->nodesize;
+ bg->space_info->bytes_reserved -= fs_info->nodesize;
+ spin_unlock(&bg->lock);
+ spin_unlock(&bg->space_info->lock);
+
+ btrfs_put_block_group(bg);
+
+ return 0;
}
-static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_down_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_trans_handle *trans = wc->trans;
+ struct btrfs_fs_info *fs_info = wc->log->fs_info;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
@@ -2674,12 +3019,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
next = btrfs_find_create_tree_block(fs_info, bytenr,
btrfs_header_owner(cur),
*level - 1);
- if (IS_ERR(next))
- return PTR_ERR(next);
+ if (IS_ERR(next)) {
+ ret = PTR_ERR(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ return ret;
+ }
if (*level == 1) {
- ret = wc->process_func(root, next, wc, ptr_gen,
- *level - 1);
+ ret = wc->process_func(next, wc, ptr_gen, *level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2690,6 +3040,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
@@ -2705,6 +3059,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
ret = btrfs_read_extent_buffer(next, &check);
if (ret) {
free_extent_buffer(next);
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
@@ -2721,10 +3079,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level,
- struct walk_control *wc)
+static noinline int walk_up_log_tree(struct btrfs_path *path, int *level,
+ struct walk_control *wc)
{
int i;
int slot;
@@ -2738,14 +3094,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level == 0);
return 0;
} else {
- ret = wc->process_func(root, path->nodes[*level], wc,
+ ret = wc->process_func(path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
if (ret)
return ret;
if (wc->free) {
- ret = clean_log_buffer(trans, path->nodes[*level]);
+ ret = clean_log_buffer(wc->trans, path->nodes[*level]);
if (ret)
return ret;
}
@@ -2762,13 +3118,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
* the tree freeing any blocks that have a ref count of zero after being
* decremented.
*/
-static int walk_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *log, struct walk_control *wc)
+static int walk_log_tree(struct walk_control *wc)
{
+ struct btrfs_root *log = wc->log;
int ret = 0;
int wret;
int level;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int orig_level;
path = btrfs_alloc_path();
@@ -2782,36 +3138,30 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
path->slots[level] = 0;
while (1) {
- wret = walk_down_log_tree(trans, log, path, &level, wc);
+ wret = walk_down_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
- wret = walk_up_log_tree(trans, log, path, &level, wc);
+ wret = walk_up_log_tree(path, &level, wc);
if (wret > 0)
break;
- if (wret < 0) {
- ret = wret;
- goto out;
- }
+ if (wret < 0)
+ return wret;
}
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
- ret = wc->process_func(log, path->nodes[orig_level], wc,
+ ret = wc->process_func(path->nodes[orig_level], wc,
btrfs_header_generation(path->nodes[orig_level]),
orig_level);
if (ret)
- goto out;
+ return ret;
if (wc->free)
- ret = clean_log_buffer(trans, path->nodes[orig_level]);
+ ret = clean_log_buffer(wc->trans, path->nodes[orig_level]);
}
-out:
- btrfs_free_path(path);
return ret;
}
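walk_log_tree() now declares its path with BTRFS_PATH_AUTO_FREE(), so the early returns no longer need an out: label. A minimal sketch of the scope-based cleanup idiom from <linux/cleanup.h> that such macros are built on, with an assumed demo type and free routine:

#include <linux/cleanup.h>
#include <linux/slab.h>

struct demo_buf {
	void *data;
};

static void demo_buf_free(struct demo_buf *buf)
{
	kfree(buf->data);
	kfree(buf);
}

/* Teach cleanup.h how to free the type; __free() then runs it on scope exit. */
DEFINE_FREE(demo_buf, struct demo_buf *, if (_T) demo_buf_free(_T))

static int demo_walk(void)
{
	struct demo_buf *buf __free(demo_buf) = kzalloc(sizeof(*buf), GFP_KERNEL);

	if (!buf)
		return -ENOMEM;

	/* Any early return below frees buf automatically, no goto needed. */
	return 0;
}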
@@ -3220,7 +3570,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
mutex_unlock(&fs_info->tree_log_mutex);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
@@ -3272,12 +3622,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
{
int ret;
struct walk_control wc = {
- .free = 1,
- .process_func = process_one_buffer
+ .free = true,
+ .process_func = process_one_buffer,
+ .log = log,
+ .trans = trans,
};
if (log->node) {
- ret = walk_log_tree(trans, log, &wc);
+ ret = walk_log_tree(&wc);
if (ret) {
/*
* We weren't able to traverse the entire log tree, the
@@ -3340,6 +3692,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
return 0;
}
+static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ bool ret = false;
+
+ /*
+ * Do this only if ->logged_trans is still 0 to prevent races with
+ * concurrent logging as we may see the inode not logged when
+ * inode_logged() is called but it gets logged after inode_logged() did
+ * not find it in the log tree and we end up setting ->logged_trans to a
+ * value less than trans->transid after the concurrent logging task has
+ * set it to trans->transid. As a consequence, subsequent rename, unlink
+ * and link operations may end up neither logging new names nor removing
+ * old names from the log.
+ */
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == 0)
+ inode->logged_trans = trans->transid - 1;
+ else if (inode->logged_trans == trans->transid)
+ ret = true;
+ spin_unlock(&inode->lock);
+
+ return ret;
+}
+
/*
* Check if an inode was logged in the current transaction. This correctly deals
* with the case where the inode was logged but has a logged_trans of 0, which
@@ -3357,15 +3734,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
- if (inode->logged_trans == trans->transid)
+ /*
+ * Quick lockless call, since once ->logged_trans is set to the current
+ * transaction, we never set it to a lower value anywhere else.
+ */
+ if (data_race(inode->logged_trans) == trans->transid)
return 1;
/*
- * If logged_trans is not 0, then we know the inode logged was not logged
- * in this transaction, so we can return false right away.
+ * If logged_trans is not 0 and not trans->transid, then we know the
+ * inode was not logged in this transaction, so we can return false
+ * right away. We take the lock to avoid a race caused by load/store
+ * tearing with a concurrent btrfs_log_inode() call or a concurrent task
+ * in this function further below - an update to trans->transid can be
+ * torn into two 32-bit updates for example, in which case we could
+ * see a positive value that is not trans->transid and assume the inode
+ * was not logged when it was.
*/
- if (inode->logged_trans > 0)
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == trans->transid) {
+ spin_unlock(&inode->lock);
+ return 1;
+ } else if (inode->logged_trans > 0) {
+ spin_unlock(&inode->lock);
return 0;
+ }
+ spin_unlock(&inode->lock);
/*
* If no log tree was created for this root in this transaction, then
@@ -3374,10 +3768,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* transaction's ID, to avoid the search below in a future call in case
* a log tree gets created after this.
*/
- if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
- inode->logged_trans = trans->transid - 1;
- return 0;
- }
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return mark_inode_as_not_logged(trans, inode);
/*
* We have a log tree and the inode's logged_trans is 0. We can't tell
@@ -3431,29 +3823,17 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* Set logged_trans to a value greater than 0 and less than the
* current transaction to avoid doing the search in future calls.
*/
- inode->logged_trans = trans->transid - 1;
- return 0;
+ return mark_inode_as_not_logged(trans, inode);
}
/*
* The inode was previously logged and then evicted, set logged_trans to
- * the current transacion's ID, to avoid future tree searches as long as
+ * the current transaction's ID, to avoid future tree searches as long as
* the inode is not evicted again.
*/
+ spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
-
- /*
- * If it's a directory, then we must set last_dir_index_offset to the
- * maximum possible value, so that the next attempt to log the inode does
- * not skip checking if dir index keys found in modified subvolume tree
- * leaves have been logged before, otherwise it would result in attempts
- * to insert duplicate dir index keys in the log tree. This must be done
- * because last_dir_index_offset is an in-memory only field, not persisted
- * in the inode item or any other on-disk structure, so its value is lost
- * once the inode is evicted.
- */
- if (S_ISDIR(inode->vfs_inode.i_mode))
- inode->last_dir_index_offset = (u64)-1;
+ spin_unlock(&inode->lock);
return 1;
}
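inode_logged() now pairs a lockless read of ->logged_trans (annotated with data_race()) with a locked re-check, relying on the fact that the field is only ever raised to the current transaction id. A small sketch of this fast-path/slow-path pattern on a hypothetical structure:

#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_obj {
	spinlock_t lock;
	u64 last_seen;	/* only ever raised to the current generation */
};

/* Returns true if @obj was already seen in generation @gen. */
static bool demo_seen_in_gen(struct demo_obj *obj, u64 gen)
{
	bool seen;

	/*
	 * Lockless fast path: the field never moves back below @gen, so a
	 * racy read that already shows @gen is definitive. data_race()
	 * documents for KCSAN that the unlocked access is intentional.
	 */
	if (data_race(obj->last_seen) == gen)
		return true;

	/*
	 * Slow path: take the lock to rule out load/store tearing before
	 * trusting any other value.
	 */
	spin_lock(&obj->lock);
	seen = (obj->last_seen == gen);
	spin_unlock(&obj->lock);

	return seen;
}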
@@ -3519,13 +3899,13 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name,
struct btrfs_inode *dir, u64 index)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
ret = inode_logged(trans, dir, NULL);
if (ret == 0)
return;
- else if (ret < 0) {
+ if (ret < 0) {
btrfs_set_log_full_commit(trans);
return;
}
@@ -3539,7 +3919,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
ret = join_running_log_trans(root);
ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
if (WARN_ON(ret))
- goto out;
+ return;
mutex_lock(&dir->log_mutex);
@@ -3549,8 +3929,6 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
if (ret < 0)
btrfs_set_log_full_commit(trans);
btrfs_end_log_trans(root);
-out:
- btrfs_free_path(path);
}
/* see comments for btrfs_del_dir_entries_in_log */
@@ -3663,8 +4041,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
struct btrfs_key *ins_keys;
u32 *ins_sizes;
- ins_data = kmalloc(count * sizeof(u32) +
- count * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc_array(count, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
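The allocation above switches from an open-coded "count * a + count * b" size to kmalloc_array(), which fails cleanly if the multiplication would overflow. A sketch of the same idiom for a generic batch of keys and item sizes, with a hypothetical key type:

#include <linux/slab.h>
#include <linux/types.h>

struct demo_key {
	u64 objectid;
	u8 type;
	u64 offset;
};

/*
 * One allocation backs two parallel arrays: @count keys followed by
 * @count u32 item sizes. kmalloc_array() rejects the request if
 * count * (sizeof(key) + sizeof(u32)) would overflow, which the
 * open-coded form did not check.
 */
static void *demo_alloc_batch(size_t count)
{
	return kmalloc_array(count, sizeof(struct demo_key) + sizeof(u32),
			     GFP_NOFS);
}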
@@ -4045,7 +4422,7 @@ done:
/*
* If the inode was logged before and it was evicted, then its
- * last_dir_index_offset is (u64)-1, so we don't the value of the last index
+ * last_dir_index_offset is 0, so we don't know the value of the last index
* key offset. If that's the case, search for it and update the inode. This
* is to avoid lookups in the log tree every time we try to insert a dir index
* key from a leaf changed in the current transaction, and to allow us to always
@@ -4061,7 +4438,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode,
lockdep_assert_held(&inode->log_mutex);
- if (inode->last_dir_index_offset != (u64)-1)
+ if (inode->last_dir_index_offset != 0)
return 0;
if (!ctx->logged_before) {
@@ -4227,7 +4604,7 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans,
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
- struct inode *inode, int log_inode_only,
+ struct inode *inode, bool log_inode_only,
u64 logged_isize)
{
u64 flags;
@@ -4323,7 +4700,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
- 0, 0);
+ false, 0);
btrfs_release_path(path);
return 0;
}
@@ -4427,8 +4804,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src = src_path->nodes[0];
- ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
- nr * sizeof(u32), GFP_NOFS);
+ ins_data = kmalloc_array(nr, sizeof(struct btrfs_key) + sizeof(u32), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
@@ -4829,7 +5205,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct btrfs_key key;
const u64 i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
- struct btrfs_path *dst_path = NULL;
+ BTRFS_PATH_AUTO_FREE(dst_path);
bool dropped_extents = false;
u64 truncate_offset = i_size;
struct extent_buffer *leaf;
@@ -4947,7 +5323,6 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
start_slot, ins_nr, 1, 0, ctx);
out:
btrfs_release_path(path);
- btrfs_free_path(dst_path);
return ret;
}
@@ -5320,7 +5695,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
u64 *other_ino, u64 *other_parent)
{
int ret;
- struct btrfs_path *search_path;
+ BTRFS_PATH_AUTO_FREE(search_path);
char *name = NULL;
u32 name_len = 0;
u32 item_size = btrfs_item_size(eb, slot);
@@ -5405,7 +5780,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
}
ret = 0;
out:
- btrfs_free_path(search_path);
kfree(name);
return ret;
}
@@ -6133,8 +6507,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
if (!first)
return 0;
- ins_data = kmalloc(max_batch_size * sizeof(u32) +
- max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc_array(max_batch_size, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
ins_sizes = (u32 *)ins_data;
@@ -6788,7 +7161,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
int ret;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
@@ -6804,7 +7177,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
while (true) {
struct extent_buffer *leaf = path->nodes[0];
@@ -6816,8 +7189,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -6875,10 +7248,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
* at both parents and the old parent B would still
* exist.
*/
- if (IS_ERR(dir_inode)) {
- ret = PTR_ERR(dir_inode);
- goto out;
- }
+ if (IS_ERR(dir_inode))
+ return PTR_ERR(dir_inode);
if (!need_log_inode(trans, dir_inode)) {
btrfs_add_delayed_iput(dir_inode);
@@ -6891,14 +7262,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ret = log_new_dir_dentries(trans, dir_inode, ctx);
btrfs_add_delayed_iput(dir_inode);
if (ret)
- goto out;
+ return ret;
}
path->slots[0]++;
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
static int log_new_ancestors(struct btrfs_trans_handle *trans,
@@ -7009,7 +7377,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key search_key;
int ret;
@@ -7030,7 +7398,7 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 0)
path->slots[0]++;
@@ -7042,8 +7410,8 @@ again:
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
- else if (ret > 0)
+ return ret;
+ if (ret > 0)
break;
continue;
}
@@ -7060,10 +7428,8 @@ again:
* this loop, etc). So just return some error to fallback to
* a transaction commit.
*/
- if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
- ret = -EMLINK;
- goto out;
- }
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY)
+ return -EMLINK;
/*
* Logging ancestors needs to do more searches on the fs/subvol
@@ -7075,14 +7441,11 @@ again:
ret = log_new_ancestors(trans, root, path, ctx);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
goto again;
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -7262,10 +7625,12 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
}
wc.trans = trans;
- wc.pin = 1;
+ wc.pin = true;
+ wc.log = log_root_tree;
- ret = walk_log_tree(trans, log_root_tree, &wc);
- if (ret) {
+ ret = walk_log_tree(&wc);
+ wc.log = NULL;
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -7276,12 +7641,11 @@ again:
key.offset = (u64)-1;
while (1) {
- struct btrfs_root *log;
struct btrfs_key found_key;
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -7296,20 +7660,19 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_tree_root(log_root_tree, &found_key);
- if (IS_ERR(log)) {
- ret = PTR_ERR(log);
+ wc.log = btrfs_read_tree_root(log_root_tree, &found_key);
+ if (IS_ERR(wc.log)) {
+ ret = PTR_ERR(wc.log);
+ wc.log = NULL;
btrfs_abort_transaction(trans, ret);
goto error;
}
- wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
- true);
- if (IS_ERR(wc.replay_dest)) {
- ret = PTR_ERR(wc.replay_dest);
- wc.replay_dest = NULL;
- if (ret != -ENOENT) {
- btrfs_put_root(log);
+ wc.root = btrfs_get_fs_root(fs_info, found_key.offset, true);
+ if (IS_ERR(wc.root)) {
+ ret = PTR_ERR(wc.root);
+ wc.root = NULL;
+ if (unlikely(ret != -ENOENT)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -7325,33 +7688,34 @@ again:
* block from being modified, and we'll just bail for
* each subsequent pass.
*/
- ret = btrfs_pin_extent_for_log_replay(trans, log->node);
- if (ret) {
- btrfs_put_root(log);
+ ret = btrfs_pin_extent_for_log_replay(trans, wc.log->node);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error;
}
goto next;
}
- wc.replay_dest->log_root = log;
- ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret) {
+ wc.root->log_root = wc.log;
+ ret = btrfs_record_root_in_trans(trans, wc.root);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto next;
}
- ret = walk_log_tree(trans, log, &wc);
- if (ret) {
+ ret = walk_log_tree(&wc);
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto next;
}
if (wc.stage == LOG_WALK_REPLAY_ALL) {
- struct btrfs_root *root = wc.replay_dest;
+ struct btrfs_root *root = wc.root;
- ret = fixup_inode_link_counts(trans, wc.replay_dest, path);
- if (ret) {
+ wc.subvol_path = path;
+ ret = fixup_inode_link_counts(&wc);
+ wc.subvol_path = NULL;
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto next;
}
@@ -7364,17 +7728,18 @@ again:
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto next;
}
}
next:
- if (wc.replay_dest) {
- wc.replay_dest->log_root = NULL;
- btrfs_put_root(wc.replay_dest);
+ if (wc.root) {
+ wc.root->log_root = NULL;
+ btrfs_put_root(wc.root);
}
- btrfs_put_root(log);
+ btrfs_put_root(wc.log);
+ wc.log = NULL;
if (ret)
goto error;
@@ -7386,7 +7751,7 @@ next:
/* step one is to pin it all, step two is to replay just inodes */
if (wc.pin) {
- wc.pin = 0;
+ wc.pin = false;
wc.process_func = replay_one_buffer;
wc.stage = LOG_WALK_REPLAY_INODES;
goto again;
@@ -7404,14 +7769,13 @@ next:
if (ret)
return ret;
- log_root_tree->log_root = NULL;
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
- btrfs_put_root(log_root_tree);
return 0;
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ btrfs_put_root(wc.log);
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index b7a96a005487..46bd8ca58670 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -487,12 +487,12 @@ static int rollback_verity(struct btrfs_inode *inode)
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = del_orphan(trans, inode);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
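Several hunks in this patch wrap error checks that lead to btrfs_abort_transaction() in unlikely(). A short sketch of the annotation; it only hints to the compiler that the abort path is cold and does not change behaviour:

#include <linux/compiler.h>

static int demo_commit_step(int ret)
{
	if (unlikely(ret)) {
		/* e.g. btrfs_abort_transaction(trans, ret) in the real code */
		return ret;
	}
	return 0;
}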
@@ -676,11 +676,11 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
if (ret < 0)
return ret;
- if (item.reserved[0] != 0 || item.reserved[1] != 0)
+ if (unlikely(item.reserved[0] != 0 || item.reserved[1] != 0))
return -EUCLEAN;
true_size = btrfs_stack_verity_descriptor_size(&item);
- if (true_size > INT_MAX)
+ if (unlikely(true_size > INT_MAX))
return -EUCLEAN;
if (buf_size == 0)
@@ -802,6 +802,8 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations btrfs_verityops = {
+ .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) -
+ (int)offsetof(struct btrfs_inode, vfs_inode),
.begin_enable_verity = btrfs_begin_enable_verity,
.end_enable_verity = btrfs_end_enable_verity,
.get_verity_descriptor = btrfs_get_verity_descriptor,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fa7a929a0461..2bec544d8ba3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1377,8 +1377,8 @@ struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
}
/*
- * Make sure the last byte of label is properly NUL termiated. We use
- * '%s' to print the label, if not properly NUL termiated we can access
+ * Make sure the last byte of label is properly NUL terminated. We use
+ * '%s' to print the label, if not properly NUL terminated we can access
* beyond the label.
*/
if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1])
@@ -1911,7 +1911,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
if (ret < 0)
goto error;
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/* Corruption */
btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
ret = -EUCLEAN;
@@ -2243,7 +2243,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
ret = btrfs_rm_dev_item(trans, device);
- if (ret) {
+ if (unlikely(ret)) {
/* Any error in dev item removal is critical */
btrfs_crit(fs_info,
"failed to remove device item for devid %llu: %d",
@@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error;
}
+ if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = true;
down_write(&sb->s_umount);
@@ -2838,21 +2843,21 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
mutex_lock(&fs_info->chunk_mutex);
ret = init_first_rw_device(trans);
mutex_unlock(&fs_info->chunk_mutex);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
}
ret = btrfs_add_dev_item(trans, device);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
if (seeding_dev) {
ret = btrfs_finish_sprout(trans);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
@@ -3044,7 +3049,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
- else if (ret > 0) { /* Logic error or corruption */
+ else if (unlikely(ret > 0)) { /* Logic error or corruption */
btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
chunk_offset);
btrfs_abort_transaction(trans, -ENOENT);
@@ -3053,7 +3058,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3278,7 +3283,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
- if (ret) {
+ if (unlikely(ret)) {
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3348,7 +3353,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
struct btrfs_space_info *space_info;
space_info = btrfs_find_space_info(fs_info, sys_flags);
- if (!space_info) {
+ if (unlikely(!space_info)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3362,17 +3367,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = remove_chunk_item(trans, map, chunk_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- } else if (ret) {
+ } else if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3381,7 +3386,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3397,7 +3402,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
btrfs_trans_release_chunk_metadata(trans);
ret = btrfs_remove_block_group(trans, map);
- if (ret) {
+ if (unlikely(ret)) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3522,7 +3527,7 @@ again:
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
- if (ret == 0) {
+ if (unlikely(ret == 0)) {
/*
* On the first search we would find chunk tree with
* offset -1, which is not possible. On subsequent
@@ -4264,7 +4269,7 @@ error:
* @flags: profile to validate
* @extended: if true @flags is treated as an extended profile
*/
-static int alloc_profile_is_valid(u64 flags, int extended)
+static int alloc_profile_is_valid(u64 flags, bool extended)
{
u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
BTRFS_BLOCK_GROUP_PROFILE_MASK);
@@ -4458,7 +4463,7 @@ out_overflow:
}
/*
- * Should be called with balance mutexe held
+ * Should be called with balance mutex held
*/
int btrfs_balance(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl,
@@ -5036,7 +5041,7 @@ again:
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
btrfs_trans_release_chunk_metadata(trans);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
} else {
@@ -5696,7 +5701,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
- if (!chunk) {
+ if (unlikely(!chunk)) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
@@ -7481,7 +7486,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
/*
* Lockdep complains about possible circular locking dependency between
* a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
- * used for freeze procection of a fs (struct super_block.s_writers),
+ * used for freeze protection of a fs (struct super_block.s_writers),
* which we take when starting a transaction, and extent buffers of the
* chunk tree if we call read_one_dev() while holding a lock on an
* extent buffer of the chunk tree. Since we are mounting the filesystem
@@ -7914,8 +7919,6 @@ int btrfs_bg_type_to_factor(u64 flags)
return btrfs_raid_array[index].ncopies;
}
-
-
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
@@ -7929,7 +7932,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
int i;
map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
- if (!map) {
+ if (unlikely(!map)) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
@@ -7938,7 +7941,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
stripe_len = btrfs_calc_stripe_length(map);
- if (physical_len != stripe_len) {
+ if (unlikely(physical_len != stripe_len)) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
physical_offset, devid, map->start, physical_len,
@@ -7958,8 +7961,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
devid, physical_offset, physical_len);
for (i = 0; i < map->num_stripes; i++) {
- if (map->stripes[i].dev->devid == devid &&
- map->stripes[i].physical == physical_offset) {
+ if (unlikely(map->stripes[i].dev->devid == devid &&
+ map->stripes[i].physical == physical_offset)) {
found = true;
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
@@ -7972,7 +7975,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
break;
}
}
- if (!found) {
+ if (unlikely(!found)) {
btrfs_err(fs_info,
"dev extent physical offset %llu devid %llu has no corresponding chunk",
physical_offset, devid);
@@ -7981,13 +7984,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* Make sure no dev extent is beyond device boundary */
dev = btrfs_find_device(fs_info->fs_devices, &args);
- if (!dev) {
+ if (unlikely(!dev)) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
- if (physical_offset + physical_len > dev->disk_total_bytes) {
+ if (unlikely(physical_offset + physical_len > dev->disk_total_bytes)) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
devid, physical_offset, physical_len,
@@ -7999,8 +8002,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
if (dev->zone_info) {
u64 zone_size = dev->zone_info->zone_size;
- if (!IS_ALIGNED(physical_offset, zone_size) ||
- !IS_ALIGNED(physical_len, zone_size)) {
+ if (unlikely(!IS_ALIGNED(physical_offset, zone_size) ||
+ !IS_ALIGNED(physical_len, zone_size))) {
btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
devid, physical_offset, physical_len);
@@ -8024,7 +8027,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
struct btrfs_chunk_map *map;
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
- if (map->num_stripes != map->verified_stripes) {
+ if (unlikely(map->num_stripes != map->verified_stripes)) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
map->start, map->verified_stripes, map->num_stripes);
@@ -8084,7 +8087,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
if (ret < 0)
goto out;
/* No dev extents at all? Not good */
- if (ret > 0) {
+ if (unlikely(ret > 0)) {
ret = -EUCLEAN;
goto out;
}
@@ -8109,7 +8112,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
physical_len = btrfs_dev_extent_length(leaf, dext);
/* Check if this dev extent overlaps with the previous one */
- if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+ if (unlikely(devid == prev_devid && physical_offset < prev_dev_ext_end)) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
devid, physical_offset, prev_dev_ext_end);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a56e873a3029..2cbf8080eade 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -34,7 +34,7 @@ struct btrfs_zoned_device_info;
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
/*
- * Arbitratry maximum size of one discard request to limit potentially long time
+ * Arbitrary maximum size of one discard request to limit potentially long time
* spent in blkdev_issue_discard().
*/
#define BTRFS_MAX_DISCARD_CHUNK_SIZE (SZ_1G)
@@ -495,7 +495,7 @@ struct btrfs_discard_stripe {
};
/*
- * Context for IO subsmission for device stripe.
+ * Context for IO submission for device stripe.
*
* - Track the unfinished mirrors for mirror based profiles
* Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 5292cd341f70..6caba8be7c84 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -34,11 +34,9 @@ struct workspace {
int level;
};
-static struct workspace_manager wsm;
-
-struct list_head *zlib_get_workspace(unsigned int level)
+struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level)
{
- struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level);
+ struct list_head *ws = btrfs_get_workspace(fs_info, BTRFS_COMPRESS_ZLIB, level);
struct workspace *workspace = list_entry(ws, struct workspace, list);
workspace->level = level;
@@ -55,8 +53,25 @@ void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zlib_alloc_workspace(unsigned int level)
+/*
+ * For s390 hardware acceleration, the buffer size should be at least
+ * ZLIB_DFLTCC_BUF_SIZE to achieve the best performance.
+ *
+ * But if the block size is larger than the page size (bs > ps), folios can
+ * already be large enough to satisfy the s390 hardware requirement.
+ */
+static bool need_special_buffer(struct btrfs_fs_info *fs_info)
+{
+ if (!zlib_deflate_dfltcc_enabled())
+ return false;
+ if (btrfs_min_folio_size(fs_info) >= ZLIB_DFLTCC_BUF_SIZE)
+ return false;
+ return true;
+}
+
+struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level)
{
+ const u32 blocksize = fs_info->sectorsize;
struct workspace *workspace;
int workspacesize;
@@ -69,19 +84,15 @@ struct list_head *zlib_alloc_workspace(unsigned int level)
workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN);
workspace->level = level;
workspace->buf = NULL;
- /*
- * In case of s390 zlib hardware support, allocate lager workspace
- * buffer. If allocator fails, fall back to a single page buffer.
- */
- if (zlib_deflate_dfltcc_enabled()) {
+ if (need_special_buffer(fs_info)) {
workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE,
__GFP_NOMEMALLOC | __GFP_NORETRY |
__GFP_NOWARN | GFP_NOIO);
workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE;
}
if (!workspace->buf) {
- workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
- workspace->buf_size = PAGE_SIZE;
+ workspace->buf = kmalloc(blocksize, GFP_KERNEL);
+ workspace->buf_size = blocksize;
}
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -133,11 +144,15 @@ static int copy_data_into_buffer(struct address_space *mapping,
return 0;
}
-int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
int ret;
char *data_in = NULL;
char *cfolio_out;
@@ -146,7 +161,8 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
struct folio *out_folio = NULL;
unsigned long len = *total_out;
unsigned long nr_dest_folios = *out_folios;
- const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ const unsigned long max_out = nr_dest_folios << min_folio_shift;
+ const u32 blocksize = fs_info->sectorsize;
const u64 orig_end = start + len;
*out_folios = 0;
@@ -155,9 +171,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflateInit(&workspace->strm, workspace->level);
if (unlikely(ret != Z_OK)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zlib compression init failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode), start);
ret = -EIO;
@@ -167,7 +181,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -179,7 +193,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->strm.next_in = workspace->buf;
workspace->strm.avail_in = 0;
workspace->strm.next_out = cfolio_out;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
while (workspace->strm.total_in < len) {
/*
@@ -191,10 +205,11 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned int copy_length = min(bytes_left, workspace->buf_size);
/*
- * This can only happen when hardware zlib compression is
- * enabled.
+ * For s390 hardware accelerated zlib, when our folio is smaller
+ * than copy_length, we need to fill the buffer so that we can
+ * take full advantage of hardware acceleration.
*/
- if (copy_length > PAGE_SIZE) {
+ if (need_special_buffer(fs_info)) {
ret = copy_data_into_buffer(mapping, workspace,
start, copy_length);
if (ret < 0)
@@ -225,9 +240,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
if (unlikely(ret != Z_OK)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_warn(inode->root->fs_info,
+ btrfs_warn(fs_info,
"zlib compression failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode),
start);
@@ -237,7 +250,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
}
/* we're making it bigger, give up */
- if (workspace->strm.total_in > 8192 &&
+ if (workspace->strm.total_in > blocksize * 2 &&
workspace->strm.total_in <
workspace->strm.total_out) {
ret = -E2BIG;
@@ -252,7 +265,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -260,7 +273,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
cfolio_out = folio_address(out_folio);
folios[nr_folios] = out_folio;
nr_folios++;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
workspace->strm.next_out = cfolio_out;
}
/* we're all done */
@@ -278,7 +291,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = zlib_deflate(&workspace->strm, Z_FINISH);
if (ret == Z_STREAM_END)
break;
- if (ret != Z_OK && ret != Z_BUF_ERROR) {
+ if (unlikely(ret != Z_OK && ret != Z_BUF_ERROR)) {
zlib_deflateEnd(&workspace->strm);
ret = -EIO;
goto out;
@@ -288,7 +301,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -296,7 +309,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
cfolio_out = folio_address(out_folio);
folios[nr_folios] = out_folio;
nr_folios++;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = min_folio_size;
workspace->strm.next_out = cfolio_out;
}
}
@@ -322,20 +335,22 @@ out:
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
int ret = 0, ret2;
int wbits = MAX_WBITS;
char *data_in;
size_t total_out = 0;
unsigned long folio_in_index = 0;
size_t srclen = cb->compressed_len;
- unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size);
unsigned long buf_start;
struct folio **folios_in = cb->compressed_folios;
data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
- workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size);
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
@@ -396,7 +411,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
data_in = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
- workspace->strm.avail_in = min(tmp, PAGE_SIZE);
+ workspace->strm.avail_in = min(tmp, min_folio_size);
}
}
if (unlikely(ret != Z_STREAM_END)) {
@@ -484,8 +499,7 @@ out:
return ret;
}
-const struct btrfs_compress_op btrfs_zlib_compress = {
- .workspace_manager = &wsm,
+const struct btrfs_compress_levels btrfs_zlib_compress = {
.min_level = 1,
.max_level = 9,
.default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index ea662036f441..e00036672f33 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -274,7 +274,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
return ret;
}
*nr_zones = ret;
- if (!ret)
+ if (unlikely(!ret))
return -EIO;
/* Populate cache */
@@ -315,7 +315,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
if (ret < 0)
return ret;
/* No dev extents at all? Not good */
- if (ret > 0)
+ if (unlikely(ret > 0))
return -EUCLEAN;
}
@@ -503,7 +503,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
}
- if (nreported != zone_info->nr_zones) {
+ if (unlikely(nreported != zone_info->nr_zones)) {
btrfs_err(device->fs_info,
"inconsistent number of zones on %s (%u/%u)",
rcu_dereference(device->name), nreported,
@@ -513,7 +513,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
}
if (max_active_zones) {
- if (nactive > max_active_zones) {
+ if (unlikely(nactive > max_active_zones)) {
+ if (bdev_max_active_zones(bdev) == 0) {
+ max_active_zones = 0;
+ zone_info->max_active_zones = 0;
+ goto validate;
+ }
btrfs_err(device->fs_info,
"zoned: %u active zones on %s exceeds max_active_zones %u",
nactive, rcu_dereference(device->name),
@@ -526,6 +531,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
}
+validate:
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -544,7 +550,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (ret)
goto out;
- if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
+ if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) {
btrfs_err(device->fs_info,
"zoned: failed to read super block log zone info at devid %llu zone %u",
device->devid, sb_zone);
@@ -562,7 +568,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
ret = sb_write_pointer(device->bdev,
&zone_info->sb_zones[sb_pos], &sb_wp);
- if (ret != -ENOENT && ret) {
+ if (unlikely(ret != -ENOENT && ret)) {
btrfs_err(device->fs_info,
"zoned: super block log zone corrupted devid %llu zone %u",
device->devid, sb_zone);
@@ -895,7 +901,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
zones);
if (ret < 0)
return ret;
- if (ret != BTRFS_NR_SB_LOG_ZONES)
+ if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))
return -EIO;
return sb_log_location(bdev, zones, rw, bytenr_ret);
@@ -1247,7 +1253,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
root = btrfs_extent_root(fs_info, key.objectid);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
/* We should not find the exact match */
- if (!ret)
+ if (unlikely(!ret))
ret = -EUCLEAN;
if (ret < 0)
return ret;
@@ -1268,8 +1274,8 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
else
length = fs_info->nodesize;
- if (!(found_key.objectid >= cache->start &&
- found_key.objectid + length <= cache->start + cache->length)) {
+ if (unlikely(!(found_key.objectid >= cache->start &&
+ found_key.objectid + length <= cache->start + cache->length))) {
return -EUCLEAN;
}
*offset_ret = found_key.objectid + length - cache->start;
@@ -1351,7 +1357,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
return 0;
}
- if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) {
btrfs_err(fs_info,
"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
zone.start << SECTOR_SHIFT, rcu_dereference(device->name),
@@ -1393,7 +1399,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
struct zone_info *info,
unsigned long *active)
{
- if (info->alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(info->alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
info->physical);
@@ -1422,13 +1428,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
- if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[0].physical);
return -EIO;
}
- if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+ if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
zone_info[1].physical);
@@ -1441,14 +1447,14 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
zone_info[1].alloc_offset = last_alloc;
- if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+ if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) {
btrfs_err(bg->fs_info,
"zoned: write pointer offset mismatch of zones in DUP profile");
return -EIO;
}
if (test_bit(0, active) != test_bit(1, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else if (test_bit(0, active)) {
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
@@ -1483,16 +1489,16 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
zone_info[i].alloc_offset = last_alloc;
- if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
+ if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+ !btrfs_test_opt(fs_info, DEGRADED))) {
btrfs_err(fs_info,
"zoned: write pointer offset mismatch of zones in %s profile",
btrfs_bg_type_to_raid_name(map->type));
return -EIO;
}
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_test_opt(fs_info, DEGRADED) &&
- !btrfs_zone_activate(bg)) {
+ if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(bg))) {
return -EIO;
}
} else {
@@ -1548,7 +1554,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
}
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else {
if (test_bit(0, active))
@@ -1580,7 +1586,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
continue;
if (test_bit(0, active) != test_bit(i, active)) {
- if (!btrfs_zone_activate(bg))
+ if (unlikely(!btrfs_zone_activate(bg)))
return -EIO;
} else {
if (test_bit(0, active))
@@ -1637,7 +1643,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
return 0;
/* Sanity check */
- if (!IS_ALIGNED(length, fs_info->zone_size)) {
+ if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) {
btrfs_err(fs_info,
"zoned: block group %llu len %llu unaligned to zone size %llu",
logical, length, fs_info->zone_size);
@@ -1750,7 +1756,7 @@ out:
return -EINVAL;
}
- if (cache->alloc_offset > cache->zone_capacity) {
+ if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
cache->alloc_offset, cache->zone_capacity,
@@ -2081,7 +2087,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&mapped_length, &bioc, NULL, NULL);
- if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) {
ret = -EIO;
goto out_put_bioc;
}
@@ -2139,7 +2145,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
if (physical_pos == wp)
return 0;
- if (physical_pos > wp)
+ if (unlikely(physical_pos > wp))
return -EUCLEAN;
length = wp - physical_pos;
@@ -2458,16 +2464,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
return ret;
}
-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
struct btrfs_block_group *block_group;
u64 min_alloc_bytes;
if (!btrfs_is_zoned(fs_info))
- return;
+ return 0;
block_group = btrfs_lookup_block_group(fs_info, logical);
- ASSERT(block_group);
+ if (WARN_ON_ONCE(!block_group))
+ return -ENOENT;
/* No MIXED_BG on zoned btrfs. */
if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
@@ -2484,16 +2491,21 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
out:
btrfs_put_block_group(block_group);
+ return 0;
}
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
+ int ret;
struct btrfs_block_group *bg =
container_of(work, struct btrfs_block_group, zone_finish_work);
wait_on_extent_buffer_writeback(bg->last_eb);
free_extent_buffer(bg->last_eb);
- btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+ ret = do_zone_finish(bg, true);
+ if (ret)
+ btrfs_handle_fs_error(bg->fs_info, ret,
+ "Failed to finish block-group's zone");
btrfs_put_block_group(bg);
}
@@ -2515,7 +2527,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
refcount_inc(&eb->refs);
bg->last_eb = eb;
INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
- queue_work(system_unbound_wq, &bg->zone_finish_work);
+ queue_work(system_dfl_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@ -2582,9 +2594,9 @@ again:
spin_lock(&space_info->lock);
space_info->total_bytes -= bg->length;
space_info->disk_total -= bg->length * factor;
+ space_info->disk_total -= bg->zone_unusable;
/* There is no allocation ever happened. */
ASSERT(bg->used == 0);
- ASSERT(bg->zone_unusable == 0);
/* No super block in a block group on the zoned setup. */
ASSERT(bg->bytes_super == 0);
spin_unlock(&space_info->lock);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 6e11533b8e14..17c5656580dd 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group);
int btrfs_zone_finish(struct btrfs_block_group *block_group);
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb);
@@ -234,8 +234,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
return true;
}
-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length) { }
+static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ return 0;
+}
static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb) { }
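A minimal sketch, not part of the patch, of how a caller would consume the new int return from btrfs_zone_finish_endio(); the ordered-extent call site and the error message are assumptions.

/* Sketch only: the function now reports failure instead of asserting. */
static void example_finish_ordered_zone(struct btrfs_fs_info *fs_info,
					struct btrfs_ordered_extent *ordered)
{
	int ret;

	ret = btrfs_zone_finish_endio(fs_info, ordered->disk_bytenr,
				      ordered->disk_num_bytes);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret,
				      "failed to finish zone at %llu",
				      ordered->disk_bytenr);
}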
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index ff0292615e1f..c9cddcfa337b 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -77,7 +77,6 @@ struct workspace {
*/
struct zstd_workspace_manager {
- const struct btrfs_compress_op *ops;
spinlock_t lock;
struct list_head lru_list;
struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
@@ -86,8 +85,6 @@ struct zstd_workspace_manager {
struct timer_list timer;
};
-static struct zstd_workspace_manager wsm;
-
static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
static inline struct workspace *list_to_workspace(struct list_head *list)
@@ -112,19 +109,19 @@ static inline int clip_level(int level)
*/
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
+ struct zstd_workspace_manager *zwsm =
+ container_of(timer, struct zstd_workspace_manager, timer);
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
- ASSERT(timer == &wsm.timer);
-
- spin_lock(&wsm.lock);
+ spin_lock(&zwsm->lock);
- if (list_empty(&wsm.lru_list)) {
- spin_unlock(&wsm.lock);
+ if (list_empty(&zwsm->lru_list)) {
+ spin_unlock(&zwsm->lock);
return;
}
- list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+ list_for_each_prev_safe(pos, next, &zwsm->lru_list) {
struct workspace *victim = container_of(pos, struct workspace,
lru_list);
int level;
@@ -141,15 +138,15 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
list_del(&victim->list);
zstd_free_workspace(&victim->list);
- if (list_empty(&wsm.idle_ws[level]))
- clear_bit(level, &wsm.active_map);
+ if (list_empty(&zwsm->idle_ws[level]))
+ clear_bit(level, &zwsm->active_map);
}
- if (!list_empty(&wsm.lru_list))
- mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+ if (!list_empty(&zwsm->lru_list))
+ mod_timer(&zwsm->timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
- spin_unlock(&wsm.lock);
+ spin_unlock(&zwsm->lock);
}
/*
@@ -182,49 +179,56 @@ static void zstd_calc_ws_mem_sizes(void)
}
}
-void zstd_init_workspace_manager(void)
+int zstd_alloc_workspace_manager(struct btrfs_fs_info *fs_info)
{
+ struct zstd_workspace_manager *zwsm;
struct list_head *ws;
- int i;
+ ASSERT(fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] == NULL);
+ zwsm = kzalloc(sizeof(*zwsm), GFP_KERNEL);
+ if (!zwsm)
+ return -ENOMEM;
zstd_calc_ws_mem_sizes();
+ spin_lock_init(&zwsm->lock);
+ init_waitqueue_head(&zwsm->wait);
+ timer_setup(&zwsm->timer, zstd_reclaim_timer_fn, 0);
- wsm.ops = &btrfs_zstd_compress;
- spin_lock_init(&wsm.lock);
- init_waitqueue_head(&wsm.wait);
- timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
-
- INIT_LIST_HEAD(&wsm.lru_list);
- for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
- INIT_LIST_HEAD(&wsm.idle_ws[i]);
+ INIT_LIST_HEAD(&zwsm->lru_list);
+ for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&zwsm->idle_ws[i]);
+ fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = zwsm;
- ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+ ws = zstd_alloc_workspace(fs_info, ZSTD_BTRFS_MAX_LEVEL);
if (IS_ERR(ws)) {
btrfs_warn(NULL, "cannot preallocate zstd compression workspace");
} else {
- set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
- list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+ set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &zwsm->active_map);
+ list_add(ws, &zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
}
+ return 0;
}
-void zstd_cleanup_workspace_manager(void)
+void zstd_free_workspace_manager(struct btrfs_fs_info *fs_info)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct workspace *workspace;
- int i;
- spin_lock_bh(&wsm.lock);
- for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
- while (!list_empty(&wsm.idle_ws[i])) {
- workspace = container_of(wsm.idle_ws[i].next,
+ if (!zwsm)
+ return;
+ fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD] = NULL;
+ spin_lock_bh(&zwsm->lock);
+ for (int i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+ while (!list_empty(&zwsm->idle_ws[i])) {
+ workspace = container_of(zwsm->idle_ws[i].next,
struct workspace, list);
list_del(&workspace->list);
list_del(&workspace->lru_list);
zstd_free_workspace(&workspace->list);
}
}
- spin_unlock_bh(&wsm.lock);
-
- timer_delete_sync(&wsm.timer);
+ spin_unlock_bh(&zwsm->lock);
+ timer_delete_sync(&zwsm->timer);
+ kfree(zwsm);
}
/*
@@ -239,29 +243,31 @@ void zstd_cleanup_workspace_manager(void)
* offer the opportunity to reclaim the workspace in favor of allocating an
* appropriately sized one in the future.
*/
-static struct list_head *zstd_find_workspace(int level)
+static struct list_head *zstd_find_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct list_head *ws;
struct workspace *workspace;
int i = clip_level(level);
- spin_lock_bh(&wsm.lock);
- for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
- if (!list_empty(&wsm.idle_ws[i])) {
- ws = wsm.idle_ws[i].next;
+ ASSERT(zwsm);
+ spin_lock_bh(&zwsm->lock);
+ for_each_set_bit_from(i, &zwsm->active_map, ZSTD_BTRFS_MAX_LEVEL) {
+ if (!list_empty(&zwsm->idle_ws[i])) {
+ ws = zwsm->idle_ws[i].next;
workspace = list_to_workspace(ws);
list_del_init(ws);
/* keep its place if it's a lower level using this */
workspace->req_level = level;
if (clip_level(level) == workspace->level)
list_del(&workspace->lru_list);
- if (list_empty(&wsm.idle_ws[i]))
- clear_bit(i, &wsm.active_map);
- spin_unlock_bh(&wsm.lock);
+ if (list_empty(&zwsm->idle_ws[i]))
+ clear_bit(i, &zwsm->active_map);
+ spin_unlock_bh(&zwsm->lock);
return ws;
}
}
- spin_unlock_bh(&wsm.lock);
+ spin_unlock_bh(&zwsm->lock);
return NULL;
}
@@ -276,30 +282,33 @@ static struct list_head *zstd_find_workspace(int level)
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
-struct list_head *zstd_get_workspace(int level)
+struct list_head *zstd_get_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct list_head *ws;
unsigned int nofs_flag;
+ ASSERT(zwsm);
+
/* level == 0 means we can use any workspace */
if (!level)
level = 1;
again:
- ws = zstd_find_workspace(level);
+ ws = zstd_find_workspace(fs_info, level);
if (ws)
return ws;
nofs_flag = memalloc_nofs_save();
- ws = zstd_alloc_workspace(level);
+ ws = zstd_alloc_workspace(fs_info, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ws)) {
DEFINE_WAIT(wait);
- prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&zwsm->wait, &wait, TASK_UNINTERRUPTIBLE);
schedule();
- finish_wait(&wsm.wait, &wait);
+ finish_wait(&zwsm->wait, &wait);
goto again;
}
@@ -318,34 +327,36 @@ again:
* isn't set, it is also set here. Only the max level workspace tries and wakes
* up waiting workspaces.
*/
-void zstd_put_workspace(struct list_head *ws)
+void zstd_put_workspace(struct btrfs_fs_info *fs_info, struct list_head *ws)
{
+ struct zstd_workspace_manager *zwsm = fs_info->compr_wsm[BTRFS_COMPRESS_ZSTD];
struct workspace *workspace = list_to_workspace(ws);
- spin_lock_bh(&wsm.lock);
+ ASSERT(zwsm);
+ spin_lock_bh(&zwsm->lock);
/* A node is only taken off the lru if we are the corresponding level */
if (clip_level(workspace->req_level) == workspace->level) {
/* Hide a max level workspace from reclaim */
- if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+ if (list_empty(&zwsm->idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
INIT_LIST_HEAD(&workspace->lru_list);
} else {
workspace->last_used = jiffies;
- list_add(&workspace->lru_list, &wsm.lru_list);
- if (!timer_pending(&wsm.timer))
- mod_timer(&wsm.timer,
+ list_add(&workspace->lru_list, &zwsm->lru_list);
+ if (!timer_pending(&zwsm->timer))
+ mod_timer(&zwsm->timer,
jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
}
}
- set_bit(workspace->level, &wsm.active_map);
- list_add(&workspace->list, &wsm.idle_ws[workspace->level]);
+ set_bit(workspace->level, &zwsm->active_map);
+ list_add(&workspace->list, &zwsm->idle_ws[workspace->level]);
workspace->req_level = 0;
- spin_unlock_bh(&wsm.lock);
+ spin_unlock_bh(&zwsm->lock);
if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL))
- cond_wake_up(&wsm.wait);
+ cond_wake_up(&zwsm->wait);
}
void zstd_free_workspace(struct list_head *ws)
@@ -357,8 +368,9 @@ void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zstd_alloc_workspace(int level)
+struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level)
{
+ const u32 blocksize = fs_info->sectorsize;
struct workspace *workspace;
workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
@@ -371,7 +383,7 @@ struct list_head *zstd_alloc_workspace(int level)
workspace->req_level = level;
workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN);
- workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ workspace->buf = kmalloc(blocksize, GFP_KERNEL);
if (!workspace->mem || !workspace->buf)
goto fail;
@@ -384,11 +396,13 @@ fail:
return ERR_PTR(-ENOMEM);
}
-int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
+int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
zstd_cstream *stream;
int ret = 0;
int nr_folios = 0;
@@ -399,7 +413,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long len = *total_out;
const unsigned long nr_dest_folios = *out_folios;
const u64 orig_end = start + len;
- unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ const u32 blocksize = fs_info->sectorsize;
+ const u32 min_folio_size = btrfs_min_folio_size(fs_info);
+ unsigned long max_out = nr_dest_folios * min_folio_size;
unsigned int cur_len;
workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len);
@@ -411,9 +427,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
stream = zstd_init_cstream(&workspace->params, len, workspace->mem,
workspace->size);
if (unlikely(!stream)) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zstd compression init level %d failed, root %llu inode %llu offset %llu",
workspace->req_level, btrfs_root_id(inode->root),
btrfs_ino(inode), start);
@@ -431,7 +445,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->in_buf.size = cur_len;
/* Allocate and map in the output buffer */
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -439,7 +453,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
while (1) {
size_t ret2;
@@ -447,9 +461,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_compress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
if (unlikely(zstd_is_error(ret2))) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_warn(inode->root->fs_info,
+ btrfs_warn(fs_info,
"zstd compression level %d failed, error %d root %llu inode %llu offset %llu",
workspace->req_level, zstd_get_error_code(ret2),
btrfs_root_id(inode->root), btrfs_ino(inode),
@@ -459,7 +471,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
}
/* Check to see if we are making it bigger */
- if (tot_in + workspace->in_buf.pos > 8192 &&
+ if (tot_in + workspace->in_buf.pos > blocksize * 2 &&
tot_in + workspace->in_buf.pos <
tot_out + workspace->out_buf.pos) {
ret = -E2BIG;
@@ -475,13 +487,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Check if we need more output space */
if (workspace->out_buf.pos == workspace->out_buf.size) {
- tot_out += PAGE_SIZE;
- max_out -= PAGE_SIZE;
+ tot_out += min_folio_size;
+ max_out -= min_folio_size;
if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -489,8 +501,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out,
- PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
}
/* We've reached the end of the input */
@@ -522,9 +533,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_end_stream(stream, &workspace->out_buf);
if (unlikely(zstd_is_error(ret2))) {
- struct btrfs_inode *inode = BTRFS_I(mapping->host);
-
- btrfs_err(inode->root->fs_info,
+ btrfs_err(fs_info,
"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu",
workspace->req_level, zstd_get_error_code(ret2),
btrfs_root_id(inode->root), btrfs_ino(inode),
@@ -542,13 +551,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
goto out;
}
- tot_out += PAGE_SIZE;
- max_out -= PAGE_SIZE;
+ tot_out += min_folio_size;
+ max_out -= min_folio_size;
if (nr_folios == nr_dest_folios) {
ret = -E2BIG;
goto out;
}
- out_folio = btrfs_alloc_compr_folio();
+ out_folio = btrfs_alloc_compr_folio(fs_info);
if (out_folio == NULL) {
ret = -ENOMEM;
goto out;
@@ -556,7 +565,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
folios[nr_folios++] = out_folio;
workspace->out_buf.dst = folio_address(out_folio);
workspace->out_buf.pos = 0;
- workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+ workspace->out_buf.size = min_t(size_t, max_out, min_folio_size);
}
if (tot_out >= tot_in) {
@@ -578,13 +587,16 @@ out:
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct folio **folios_in = cb->compressed_folios;
size_t srclen = cb->compressed_len;
zstd_dstream *stream;
int ret = 0;
+ const u32 blocksize = fs_info->sectorsize;
+ const unsigned int min_folio_size = btrfs_min_folio_size(fs_info);
unsigned long folio_in_index = 0;
- unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
+ unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size);
unsigned long buf_start;
unsigned long total_out = 0;
@@ -602,11 +614,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.size = min_t(size_t, srclen, min_folio_size);
workspace->out_buf.dst = workspace->buf;
workspace->out_buf.pos = 0;
- workspace->out_buf.size = PAGE_SIZE;
+ workspace->out_buf.size = blocksize;
while (1) {
size_t ret2;
@@ -642,16 +654,16 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->in_buf.pos == workspace->in_buf.size) {
kunmap_local(workspace->in_buf.src);
folio_in_index++;
- if (folio_in_index >= total_folios_in) {
+ if (unlikely(folio_in_index >= total_folios_in)) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
- srclen -= PAGE_SIZE;
+ srclen -= min_folio_size;
workspace->in_buf.src =
kmap_local_folio(folios_in[folio_in_index], 0);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+ workspace->in_buf.size = min_t(size_t, srclen, min_folio_size);
}
}
ret = 0;
@@ -718,9 +730,7 @@ finish:
return ret;
}
-const struct btrfs_compress_op btrfs_zstd_compress = {
- /* ZSTD uses own workspace manager */
- .workspace_manager = NULL,
+const struct btrfs_compress_levels btrfs_zstd_compress = {
.min_level = ZSTD_BTRFS_MIN_LEVEL,
.max_level = ZSTD_BTRFS_MAX_LEVEL,
.default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
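A minimal sketch, not part of the patch, of the per-filesystem workspace-manager lifecycle introduced above; the example_* wrappers are invented, only the zstd_* calls come from this change.

/*
 * Sketch only: the manager now lives in fs_info->compr_wsm[] instead of a
 * file-scope global, so setup, teardown and get/put all take fs_info.
 */
static int example_zstd_setup(struct btrfs_fs_info *fs_info)
{
	struct list_head *ws;
	int ret;

	ret = zstd_alloc_workspace_manager(fs_info);
	if (ret)
		return ret;

	ws = zstd_get_workspace(fs_info, 3);	/* level 3, for example */
	/* ... compress or decompress using the workspace ... */
	zstd_put_workspace(fs_info, ws);
	return 0;
}

static void example_zstd_teardown(struct btrfs_fs_info *fs_info)
{
	zstd_free_workspace_manager(fs_info);	/* also stops the reclaim timer */
}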
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 91dfd0231877..d1edb2ac3837 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -387,10 +387,9 @@ try_again:
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
struct renamedata rd = {
- .old_mnt_idmap = &nop_mnt_idmap,
+ .mnt_idmap = &nop_mnt_idmap,
.old_parent = dir,
.old_dentry = rep,
- .new_mnt_idmap = &nop_mnt_idmap,
.new_parent = cache->graveyard,
.new_dentry = grave,
};
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8b202d789e93..322ed268f14a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1264,7 +1264,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
0,
gfp_flags);
if (IS_ERR(pages[index])) {
- if (PTR_ERR(pages[index]) == -EINVAL) {
+ int err = PTR_ERR(pages[index]);
+
+ if (err == -EINVAL) {
pr_err_client(cl, "inode->i_blkbits=%hhu\n",
inode->i_blkbits);
}
@@ -1273,7 +1275,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
BUG_ON(ceph_wbc->locked_pages == 0);
pages[index] = NULL;
- return PTR_ERR(pages[index]);
+ return err;
}
} else {
pages[index] = &folio->page;
@@ -1687,6 +1689,7 @@ get_more_pages:
process_folio_batch:
rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
+ ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
if (rc)
goto release_folios;
@@ -1695,8 +1698,6 @@ process_folio_batch:
goto release_folios;
if (ceph_wbc.processed_in_fbatch) {
- ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
-
if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
ceph_wbc.locked_pages < ceph_wbc.max_pages) {
doutc(cl, "reached end fbatch, trying for more\n");
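For context, a self-contained sketch of the bug the first fs/ceph/addr.c hunk fixes; the helper is invented. The error code has to be captured before the pointer slot is cleared, because PTR_ERR(NULL) evaluates to 0 and the failure would be reported as success.

#include <linux/err.h>

/* Sketch only: capture the error code before overwriting the pointer. */
static int example_take_error(struct page **pages, int index)
{
	if (IS_ERR(pages[index])) {
		int err = PTR_ERR(pages[index]);	/* save first */

		pages[index] = NULL;			/* then clear the slot */
		return err;				/* not PTR_ERR(NULL), i.e. 0 */
	}
	return 0;
}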
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index cab722619207..7026e794813c 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -133,6 +133,8 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
}
static struct fscrypt_operations ceph_fscrypt_ops = {
+ .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) -
+ (int)offsetof(struct ceph_inode_info, netfs.inode),
.needs_bounce_pages = 1,
.get_context = ceph_crypt_get_context,
.set_context = ceph_crypt_set_context,
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fdd404fc8112..f3fe786b4143 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
- int pathlen = 0;
- u64 pathbase;
char *path;
mutex_lock(&mdsc->mutex);
@@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_dentry->d_lock);
@@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
@@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p)
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
@@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_old_dentry,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 8478e7e75df6..32973c62c1a2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* If op failed, mark everyone involved for errors */
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
/* mark error on parent + clear complete */
mapping_set_error(req->r_parent->i_mapping, result);
@@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_old_inode->i_mapping, result);
pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
}
out:
iput(req->r_old_inode);
@@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
int err = -EROFS;
int op;
char *path;
- int pathlen;
- u64 pathbase;
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
@@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For non-EACCES cases, let the MDS do the auth check */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c02f100f8552..978acd3d4b32 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
int flags, fmode, wanted;
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
int mask = MAY_READ;
@@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file)
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For non-EACCES cases, let the MDS do the auth check */
@@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
pr_warn_client(cl,
"async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
@@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
int mask;
int err;
char *path;
- int pathlen;
- u64 pathbase;
doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
dir, ceph_vinop(dir), dentry, dentry,
@@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
@@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
mask |= MAY_WRITE;
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For non-EACCES cases, let the MDS do the auth check */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fc543075b827..949f0badc944 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
+/*
+ * Check if the parent inode matches the vino from directory reply info
+ */
+static inline bool ceph_vino_matches_parent(struct inode *parent,
+ struct ceph_vino vino)
+{
+ return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
+}
+
+/*
+ * Validate that the directory inode referenced by @req->r_parent matches the
+ * inode number and snapshot id contained in the reply's directory record. If
+ * they do not match – which can theoretically happen if the parent dentry was
+ * moved between the time the request was issued and the reply arrived – fall
+ * back to looking up the correct inode in the inode cache.
+ *
+ * A reference is *always* returned. Callers that receive a different inode
+ * than the original @parent are responsible for dropping the extra reference
+ * once the reply has been processed.
+ */
+static struct inode *ceph_get_reply_dir(struct super_block *sb,
+ struct inode *parent,
+ struct ceph_mds_reply_info_parsed *rinfo)
+{
+ struct ceph_vino vino;
+
+ if (unlikely(!rinfo->diri.in))
+ return parent; /* nothing to compare against */
+
+ /* If we didn't have a cached parent inode to begin with, just bail out. */
+ if (!parent)
+ return NULL;
+
+ vino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (likely(ceph_vino_matches_parent(parent, vino)))
+ return parent; /* matches – use the original reference */
+
+ /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
+ WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
+ ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
+
+ return ceph_get_inode(sb, vino, NULL);
+}
+
/**
* ceph_new_inode - allocate a new inode in advance of an expected create
* @dir: parent directory for new inode
@@ -665,6 +711,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
#ifdef CONFIG_FS_ENCRYPTION
+ ci->i_crypt_info = NULL;
ci->fscrypt_auth = NULL;
ci->fscrypt_auth_len = 0;
#endif
@@ -1523,6 +1570,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_vino tvino, dvino;
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_client *cl = fsc->client;
+ struct inode *parent_dir = NULL;
int err = 0;
doutc(cl, "%p is_dentry %d is_target %d\n", req,
@@ -1536,10 +1584,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
}
if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_parent;
-
- if (dir) {
- err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ /*
+ * r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
+ * so we need to get the correct inode
+ */
+ parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
+ if (unlikely(IS_ERR(parent_dir))) {
+ err = PTR_ERR(parent_dir);
+ goto done;
+ }
+ if (parent_dir) {
+ err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
rinfo->dirfrag, session, -1,
&req->r_caps_reservation);
if (err < 0)
@@ -1548,14 +1603,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
bool is_nokey = false;
struct qstr dname;
struct dentry *dn, *parent;
struct fscrypt_str oname = FSTR_INIT(NULL, 0);
- struct ceph_fname fname = { .dir = dir,
+ struct ceph_fname fname = { .dir = parent_dir,
.name = rinfo->dname,
.ctext = rinfo->altname,
.name_len = rinfo->dname_len,
@@ -1564,10 +1619,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
BUG_ON(!rinfo->head->is_target);
BUG_ON(req->r_dentry);
- parent = d_find_any_alias(dir);
+ parent = d_find_any_alias(parent_dir);
BUG_ON(!parent);
- err = ceph_fname_alloc_buffer(dir, &oname);
+ err = ceph_fname_alloc_buffer(parent_dir, &oname);
if (err < 0) {
dput(parent);
goto done;
@@ -1576,7 +1631,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
if (err < 0) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
goto done;
}
dname.name = oname.name;
@@ -1595,7 +1650,7 @@ retry_lookup:
dname.len, dname.name, dn);
if (!dn) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
err = -ENOMEM;
goto done;
}
@@ -1610,12 +1665,12 @@ retry_lookup:
ceph_snap(d_inode(dn)) != tvino.snap)) {
doutc(cl, " dn %p points to wrong inode %p\n",
dn, d_inode(dn));
- ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(parent_dir);
d_delete(dn);
dput(dn);
goto retry_lookup;
}
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
req->r_dentry = dn;
dput(parent);
@@ -1794,6 +1849,9 @@ retry_lookup:
&dvino, ptvino);
}
done:
+ /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
+ if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
+ iput(parent_dir);
doutc(cl, "done err=%d\n", err);
return err;
}
@@ -2487,22 +2545,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
int truncate_retry = 20; /* The RMW will take around 50ms */
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
dentry = d_find_alias(inode);
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For non-EACCES cases, let the MDS do the auth check */
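A short sketch, not part of the patch, of the caller contract for ceph_get_reply_dir() added above: it returns req->r_parent unchanged in the common case and only hands back a freshly referenced inode on a mismatch, so only that case needs an iput(), mirroring the done: label in ceph_fill_trace().

/* Sketch only: how a caller consumes ceph_get_reply_dir(). */
struct inode *dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);

if (IS_ERR(dir))
	return PTR_ERR(dir);
/* ... fill inode/dentry state using dir instead of req->r_parent ... */
if (dir && dir != req->r_parent)
	iput(dir);	/* drop the extra reference taken by ceph_get_inode() */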
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0f497c39ff82..73da2648fa0f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2221,7 +2221,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
int count;
dput(dentry);
d_prune_aliases(inode);
- count = atomic_read(&inode->i_count);
+ count = icount_read(inode);
if (count == 1)
(*remaining)--;
doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",
@@ -2681,8 +2681,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* ceph_mdsc_build_path - build a path string to a given dentry
* @mdsc: mds client
* @dentry: dentry to which path should be built
- * @plen: returned length of string
- * @pbase: returned base inode number
+ * @path_info: output path, length, base ino+snap, and freepath ownership flag
* @for_wire: is this path going to be sent to the MDS?
*
* Build a string that represents the path to the dentry. This is mostly called
@@ -2700,7 +2699,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* foo/.snap/bar -> foo//bar
*/
char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- int *plen, u64 *pbase, int for_wire)
+ struct ceph_path_info *path_info, int for_wire)
{
struct ceph_client *cl = mdsc->fsc->client;
struct dentry *cur;
@@ -2810,16 +2809,28 @@ retry:
return ERR_PTR(-ENAMETOOLONG);
}
- *pbase = base;
- *plen = PATH_MAX - 1 - pos;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
+ path_info->vino.ino = base;
+ path_info->pathlen = PATH_MAX - 1 - pos;
+ path_info->path = path + pos;
+ path_info->freepath = true;
+
+ /* Set snap from dentry if available */
+ if (d_inode(dentry))
+ path_info->vino.snap = ceph_snap(d_inode(dentry));
+ else
+ path_info->vino.snap = CEPH_NOSNAP;
+
doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
- base, *plen, path + pos);
+ base, PATH_MAX - 1 - pos, path + pos);
return path + pos;
}
static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- struct inode *dir, const char **ppath, int *ppathlen,
- u64 *pino, bool *pfreepath, bool parent_locked)
+ struct inode *dir, struct ceph_path_info *path_info,
+ bool parent_locked)
{
char *path;
@@ -2828,41 +2839,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry
dir = d_inode_rcu(dentry->d_parent);
if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
!IS_ENCRYPTED(dir)) {
- *pino = ceph_ino(dir);
+ path_info->vino.ino = ceph_ino(dir);
+ path_info->vino.snap = ceph_snap(dir);
rcu_read_unlock();
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
+ path_info->path = dentry->d_name.name;
+ path_info->pathlen = dentry->d_name.len;
+ path_info->freepath = false;
return 0;
}
rcu_read_unlock();
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap handling.
+ */
return 0;
}
-static int build_inode_path(struct inode *inode,
- const char **ppath, int *ppathlen, u64 *pino,
- bool *pfreepath)
+static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
if (ceph_snap(inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(inode);
- *ppathlen = 0;
+ path_info->vino.ino = ceph_ino(inode);
+ path_info->vino.snap = ceph_snap(inode);
+ path_info->pathlen = 0;
+ path_info->freepath = false;
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap from dentry.
+ * Override with inode's snap since that's what this function is for.
+ */
+ path_info->vino.snap = ceph_snap(inode);
return 0;
}
@@ -2872,26 +2889,32 @@ static int build_inode_path(struct inode *inode,
*/
static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
struct dentry *rdentry, struct inode *rdiri,
- const char *rpath, u64 rino, const char **ppath,
- int *pathlen, u64 *ino, bool *freepath,
+ const char *rpath, u64 rino,
+ struct ceph_path_info *path_info,
bool parent_locked)
{
struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
if (rinode) {
- r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+ r = build_inode_path(rinode, path_info);
doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
- freepath, parent_locked);
- doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
+ r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,
+ path_info->pathlen, path_info->path);
} else if (rpath || rino) {
- *ino = rino;
- *ppath = rpath;
- *pathlen = rpath ? strlen(rpath) : 0;
- doutc(cl, " path %.*s\n", *pathlen, rpath);
+ path_info->vino.ino = rino;
+ path_info->vino.snap = CEPH_NOSNAP;
+ path_info->path = rpath;
+ path_info->pathlen = rpath ? strlen(rpath) : 0;
+ path_info->freepath = false;
+
+ doutc(cl, " path %.*s\n", path_info->pathlen, rpath);
}
return r;
@@ -2968,11 +2991,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
- const char *path1 = NULL;
- const char *path2 = NULL;
- u64 ino1 = 0, ino2 = 0;
- int pathlen1 = 0, pathlen2 = 0;
- bool freepath1 = false, freepath2 = false;
+ struct ceph_path_info path_info1 = {0};
+ struct ceph_path_info path_info2 = {0};
struct dentry *old_dentry = NULL;
int len;
u16 releases;
@@ -2982,25 +3002,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
u16 request_head_version = mds_supported_head_version(session);
kuid_t caller_fsuid = req->r_cred->fsuid;
kgid_t caller_fsgid = req->r_cred->fsgid;
+ bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
- req->r_parent, req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1,
- test_bit(CEPH_MDS_R_PARENT_LOCKED,
- &req->r_req_flags));
+ req->r_parent, req->r_path1, req->r_ino1.ino,
+ &path_info1, parent_locked);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out;
}
+ /*
+ * When the parent directory's i_rwsem is *not* locked, req->r_parent may
+ * have become stale (e.g. after a concurrent rename) between the time the
+ * dentry was looked up and now. If we detect that the stored r_parent
+ * does not match the inode number we just encoded for the request, switch
+ * to the correct inode so that the MDS receives a valid parent reference.
+ */
+ if (!parent_locked && req->r_parent && path_info1.vino.ino &&
+ ceph_ino(req->r_parent) != path_info1.vino.ino) {
+ struct inode *old_parent = req->r_parent;
+ struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
+ if (!IS_ERR(correct_dir)) {
+ WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
+ ceph_ino(old_parent), path_info1.vino.ino);
+ /*
+ * Transfer CEPH_CAP_PIN from the old parent to the new one.
+ * The pin was taken earlier in ceph_mdsc_submit_request().
+ */
+ ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN);
+ iput(old_parent);
+ req->r_parent = correct_dir;
+ ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ }
+ }
+
/* If r_old_dentry is set, then assume that its parent is locked */
if (req->r_old_dentry &&
!(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
old_dentry = req->r_old_dentry;
ret = set_request_path_attr(mdsc, NULL, old_dentry,
- req->r_old_dentry_dir,
- req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2, true);
+ req->r_old_dentry_dir,
+ req->r_path2, req->r_ino2.ino,
+ &path_info2, true);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out_free1;
@@ -3031,7 +3075,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
/* filepaths */
len += 2 * (1 + sizeof(u32) + sizeof(u64));
- len += pathlen1 + pathlen2;
+ len += path_info1.pathlen + path_info2.pathlen;
/* cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -3039,9 +3083,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
if (req->r_dentry_drop)
- len += pathlen1;
+ len += path_info1.pathlen;
if (req->r_old_dentry_drop)
- len += pathlen2;
+ len += path_info2.pathlen;
/* MClientRequest tail */
@@ -3154,8 +3198,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead->ino = cpu_to_le64(req->r_deleg_ino);
lhead->args = req->r_args;
- ceph_encode_filepath(&p, end, ino1, path1);
- ceph_encode_filepath(&p, end, ino2, path2);
+ ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
+ ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
/* make note of release offset, in case we need to replay */
req->r_request_release_offset = p - msg->front.iov_base;
@@ -3218,11 +3262,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
msg->hdr.data_off = cpu_to_le16(0);
out_free2:
- if (freepath2)
- ceph_mdsc_free_path((char *)path2, pathlen2);
+ ceph_mdsc_free_path_info(&path_info2);
out_free1:
- if (freepath1)
- ceph_mdsc_free_path((char *)path1, pathlen1);
+ ceph_mdsc_free_path_info(&path_info1);
out:
return msg;
out_err:
@@ -4579,24 +4621,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
struct ceph_cap *cap;
- char *path;
- int pathlen = 0, err;
- u64 pathbase;
+ struct ceph_path_info path_info = {0};
+ int err;
u64 snap_follows;
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out_err;
}
- } else {
- path = NULL;
- pathbase = 0;
}
spin_lock(&ci->i_ceph_lock);
@@ -4629,7 +4667,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = cpu_to_le64(pathbase);
+ rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -4644,7 +4682,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
ts = inode_get_atime(inode);
ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = cpu_to_le64(pathbase);
+ rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -4706,7 +4744,7 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
+ struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@@ -4730,7 +4768,7 @@ encode_again:
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@@ -4741,17 +4779,17 @@ out_freeflocks:
} else {
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
- pathlen + sizeof(rec.v1));
+ path_info.pathlen + sizeof(rec.v1));
if (err)
goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
}
out_err:
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
if (!err)
recon_state->nr_caps++;
return err;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 3e2a6fa7c19a..0428a5eaf28c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath,
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-static inline void ceph_mdsc_free_path(char *path, int len)
+/*
+ * Structure to group path-related output parameters for build_*_path functions
+ */
+struct ceph_path_info {
+ const char *path;
+ int pathlen;
+ struct ceph_vino vino;
+ bool freepath;
+};
+
+static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info)
{
- if (!IS_ERR_OR_NULL(path))
- __putname(path - (PATH_MAX - 1 - len));
+ if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path))
+ __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen));
}
extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
- struct dentry *dentry, int *plen, u64 *base,
+ struct dentry *dentry, struct ceph_path_info *path_info,
int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
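A minimal sketch, not part of the patch, of how converted call sites use the new ceph_path_info bundle; mdsc, cl and dentry stand in for whatever the caller already has. The returned pointer still addresses the tail of a PATH_MAX buffer, and ceph_mdsc_free_path_info() releases it only when freepath was set by ceph_mdsc_build_path().

/* Sketch only: build, use, then free a wire path via struct ceph_path_info. */
struct ceph_path_info path_info;
char *path;

path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 1);
if (IS_ERR(path))
	return PTR_ERR(path);

doutc(cl, "built path (%llx.%llx) %.*s\n",
      path_info.vino.ino, path_info.vino.snap,
      path_info.pathlen, path_info.path);

ceph_mdsc_free_path_info(&path_info);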
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c3eb651862c5..db6c2db68f96 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -862,7 +862,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0);
if (!fsc->inode_wq)
goto fail_client;
- fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
+ fsc->cap_wq = alloc_workqueue("ceph-cap", WQ_PERCPU, 1);
if (!fsc->cap_wq)
goto fail_inode_wq;
@@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
.free_inode = ceph_free_inode,
.write_inode = ceph_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cf176aab0f82..25d8bacbcf44 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -463,6 +463,7 @@ struct ceph_inode_info {
unsigned long i_work_mask;
#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
u32 fscrypt_auth_len;
u32 fscrypt_file_len;
u8 *fscrypt_auth;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 740f18b60c9d..456c4a2efb53 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode)
static const struct super_operations configfs_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.free_inode = configfs_free_inode,
};
diff --git a/fs/coredump.c b/fs/coredump.c
index 5dce257c67fc..0d9a5d07a75d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -635,7 +635,7 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
/*
* Usermode helpers are children of either
- * system_unbound_wq or of kthreadd. So we know that
+ * system_dfl_wq or of kthreadd. So we know that
* we're starting off with a clean file descriptor
* table. So we should always be able to use
* COREDUMP_PIDFD_NUMBER as our file descriptor value.
@@ -1466,11 +1466,15 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write,
ssize_t retval;
char old_core_pattern[CORENAME_MAX_SIZE];
+ if (write)
+ return proc_dostring(table, write, buffer, lenp, ppos);
+
retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
error = proc_dostring(table, write, buffer, lenp, ppos);
if (error)
return error;
+
if (!check_coredump_socket()) {
strscpy(core_pattern, old_core_pattern, retval + 1);
return -EINVAL;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index b002e9b734f9..12daa85ed941 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -116,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
inode_nohighmem(inode);
inode->i_data.a_ops = &cramfs_aops;
break;
- default:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
init_special_inode(inode, cramfs_inode->mode,
old_decode_dev(cramfs_inode->size));
+ break;
+ default:
+ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ iget_failed(inode);
+ return ERR_PTR(-EIO);
}
inode->i_mode = cramfs_inode->mode;
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index b5dfb0aa405a..464b54610fd3 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -2,10 +2,9 @@
config FS_ENCRYPTION
bool "FS Encryption (Per-file encryption)"
select CRYPTO
- select CRYPTO_HASH
- select CRYPTO_HKDF
select CRYPTO_SKCIPHER
select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_SHA512
select KEYS
help
Enable encryption of files and directories. This
@@ -32,8 +31,6 @@ config FS_ENCRYPTION_ALGS
select CRYPTO_CBC
select CRYPTO_CTS
select CRYPTO_ECB
- select CRYPTO_HMAC
- select CRYPTO_SHA512
select CRYPTO_XTS
config FS_ENCRYPTION_INLINE_CRYPT
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 486fcb2ecf13..5f5599020e94 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -113,7 +113,7 @@ out:
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
sector_t pblk, unsigned int len)
{
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
@@ -148,7 +148,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
*/
for (i = 0; i < nr_pages; i++) {
pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS :
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
if (!pages[i])
break;
}
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index b6ccab524fde..07f9cbfe3ea4 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -173,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
size_t len, size_t offs, gfp_t gfp_flags)
{
const struct inode *inode = folio->mapping->host;
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
struct page *ciphertext_page;
@@ -232,8 +232,9 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
{
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
- return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
- lblk_num, page, page, len, offs);
+ return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode),
+ FS_ENCRYPT, lblk_num, page, page, len,
+ offs);
}
EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
@@ -255,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
size_t offs)
{
const struct inode *inode = folio->mapping->host;
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
@@ -305,8 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
{
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
- return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
- lblk_num, page, page, len, offs);
+ return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode),
+ FS_DECRYPT, lblk_num, page, page, len,
+ offs);
}
EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index f9f6713e144f..8e4c213d418b 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -11,7 +11,6 @@
* This has not yet undergone a rigorous security audit.
*/
-#include <crypto/hash.h>
#include <crypto/sha2.h>
#include <crypto/skcipher.h>
#include <linux/export.h>
@@ -94,7 +93,7 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen)
{
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
@@ -138,7 +137,7 @@ static int fname_decrypt(const struct inode *inode,
const struct fscrypt_str *iname,
struct fscrypt_str *oname)
{
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
@@ -274,8 +273,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
u32 max_len, u32 *encrypted_len_ret)
{
- return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy,
- orig_len, max_len,
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
+
+ return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len,
encrypted_len_ret);
}
EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size);
@@ -543,7 +543,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
*/
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
{
- const struct fscrypt_inode_info *ci = dir->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir);
WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index d8b485b9881c..4e8e82a9ccf9 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -11,10 +11,10 @@
#ifndef _FSCRYPT_PRIVATE_H
#define _FSCRYPT_PRIVATE_H
+#include <crypto/sha2.h>
#include <linux/fscrypt.h>
#include <linux/minmax.h>
#include <linux/siphash.h>
-#include <crypto/hash.h>
#include <linux/blk-crypto.h>
#define CONST_STRLEN(str) (sizeof(str) - 1)
@@ -249,8 +249,8 @@ struct fscrypt_prepared_key {
* fscrypt_inode_info - the "encryption key" for an inode
*
* When an encrypted file's key is made available, an instance of this struct is
- * allocated and stored in ->i_crypt_info. Once created, it remains until the
- * inode is evicted.
+ * allocated and a pointer to it is stored in the file's in-memory inode. Once
+ * created, it remains until the inode is evicted.
*/
struct fscrypt_inode_info {
@@ -381,12 +381,8 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
u32 *encrypted_len_ret);
/* hkdf.c */
-struct fscrypt_hkdf {
- struct crypto_shash *hmac_tfm;
-};
-
-int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
- unsigned int master_key_size);
+void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key,
+ unsigned int master_key_size);
/*
* The list of contexts in which fscrypt uses HKDF. These values are used as
@@ -405,11 +401,9 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
#define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY \
8 /* info=<empty> */
-int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
- const u8 *info, unsigned int infolen,
- u8 *okm, unsigned int okmlen);
-
-void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
+void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context,
+ const u8 *info, unsigned int infolen,
+ u8 *okm, unsigned int okmlen);
/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
@@ -517,7 +511,7 @@ struct fscrypt_master_key_secret {
* ->is_hw_wrapped=false, or by the "software secret" that hardware
* derived from this master key if ->is_hw_wrapped=true.
*/
- struct fscrypt_hkdf hkdf;
+ struct hmac_sha512_key hkdf;
/*
* True if this key is a hardware-wrapped key; false if this key is a
@@ -696,7 +690,7 @@ struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
const struct fscrypt_key_specifier *mk_spec);
-int fscrypt_get_test_dummy_key_identifier(
+void fscrypt_get_test_dummy_key_identifier(
u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
int fscrypt_add_test_dummy_key(struct super_block *sb,
@@ -732,8 +726,8 @@ void fscrypt_destroy_prepared_key(struct super_block *sb,
int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
const u8 *raw_key);
-int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
- const struct fscrypt_master_key *mk);
+void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
+ const struct fscrypt_master_key *mk);
void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk);
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index b1ef506cd341..706f56d0076e 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -1,5 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
/*
+ * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation
+ * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010):
+ * "Cryptographic Extraction and Key Derivation: The HKDF Scheme".
+ *
* This is used to derive keys from the fscrypt master keys (or from the
* "software secrets" which hardware derives from the fscrypt master keys, in
* the case that the fscrypt master keys are hardware-wrapped keys).
@@ -7,10 +11,6 @@
* Copyright 2019 Google LLC
*/
-#include <crypto/hash.h>
-#include <crypto/hkdf.h>
-#include <crypto/sha2.h>
-
#include "fscrypt_private.h"
/*
@@ -24,7 +24,6 @@
* HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of
* SHA-512 causes HKDF-Expand to only need to do one iteration rather than two.
*/
-#define HKDF_HMAC_ALG "hmac(sha512)"
#define HKDF_HASHLEN SHA512_DIGEST_SIZE
/*
@@ -44,54 +43,24 @@
*/
/*
- * Compute HKDF-Extract using the given master key as the input keying material,
- * and prepare an HMAC transform object keyed by the resulting pseudorandom key.
- *
- * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many
- * times without having to recompute HKDF-Extract each time.
+ * Compute HKDF-Extract using 'master_key' as the input keying material, and
+ * prepare the resulting HMAC key in 'hkdf'. Afterwards, 'hkdf' can be used for
+ * HKDF-Expand many times without having to recompute HKDF-Extract each time.
*/
-int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
- unsigned int master_key_size)
+void fscrypt_init_hkdf(struct hmac_sha512_key *hkdf, const u8 *master_key,
+ unsigned int master_key_size)
{
- struct crypto_shash *hmac_tfm;
static const u8 default_salt[HKDF_HASHLEN];
u8 prk[HKDF_HASHLEN];
- int err;
-
- hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, FSCRYPT_CRYPTOAPI_MASK);
- if (IS_ERR(hmac_tfm)) {
- fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld",
- PTR_ERR(hmac_tfm));
- return PTR_ERR(hmac_tfm);
- }
-
- if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) {
- err = -EINVAL;
- goto err_free_tfm;
- }
-
- err = hkdf_extract(hmac_tfm, master_key, master_key_size,
- default_salt, HKDF_HASHLEN, prk);
- if (err)
- goto err_free_tfm;
-
- err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk));
- if (err)
- goto err_free_tfm;
- hkdf->hmac_tfm = hmac_tfm;
- goto out;
-
-err_free_tfm:
- crypto_free_shash(hmac_tfm);
-out:
+ hmac_sha512_usingrawkey(default_salt, sizeof(default_salt),
+ master_key, master_key_size, prk);
+ hmac_sha512_preparekey(hkdf, prk, sizeof(prk));
memzero_explicit(prk, sizeof(prk));
- return err;
}
/*
- * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which
- * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen'
+ * HKDF-Expand (RFC 5869 section 2.3). Expand the HMAC key 'hkdf' into 'okmlen'
* bytes of output keying material parameterized by the application-specific
* 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context'
* byte. This is thread-safe and may be called by multiple threads in parallel.
@@ -100,30 +69,32 @@ out:
* adds to its application-specific info strings to guarantee that it doesn't
* accidentally repeat an info string when using HKDF for different purposes.)
*/
-int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
- const u8 *info, unsigned int infolen,
- u8 *okm, unsigned int okmlen)
-{
- SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm);
- u8 *full_info;
- int err;
-
- full_info = kzalloc(infolen + 9, GFP_KERNEL);
- if (!full_info)
- return -ENOMEM;
- desc->tfm = hkdf->hmac_tfm;
-
- memcpy(full_info, "fscrypt\0", 8);
- full_info[8] = context;
- memcpy(full_info + 9, info, infolen);
-
- err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9,
- okm, okmlen);
- kfree_sensitive(full_info);
- return err;
-}
-
-void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf)
+void fscrypt_hkdf_expand(const struct hmac_sha512_key *hkdf, u8 context,
+ const u8 *info, unsigned int infolen,
+ u8 *okm, unsigned int okmlen)
{
- crypto_free_shash(hkdf->hmac_tfm);
+ struct hmac_sha512_ctx ctx;
+ u8 counter = 1;
+ u8 tmp[HKDF_HASHLEN];
+
+ WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN);
+
+ for (unsigned int i = 0; i < okmlen; i += HKDF_HASHLEN) {
+ hmac_sha512_init(&ctx, hkdf);
+ if (i != 0)
+ hmac_sha512_update(&ctx, &okm[i - HKDF_HASHLEN],
+ HKDF_HASHLEN);
+ hmac_sha512_update(&ctx, "fscrypt\0", 8);
+ hmac_sha512_update(&ctx, &context, 1);
+ hmac_sha512_update(&ctx, info, infolen);
+ hmac_sha512_update(&ctx, &counter, 1);
+ if (okmlen - i < HKDF_HASHLEN) {
+ hmac_sha512_final(&ctx, tmp);
+ memcpy(&okm[i], tmp, okmlen - i);
+ memzero_explicit(tmp, sizeof(tmp));
+ } else {
+ hmac_sha512_final(&ctx, &okm[i]);
+ }
+ counter++;
+ }
}
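
A minimal sketch (not part of the patch) of how the reworked HKDF helpers fit together, assuming only the signatures declared above: fscrypt_init_hkdf() and fscrypt_hkdf_expand() now return void and operate on a struct hmac_sha512_key, so callers no longer carry error handling for key derivation. derive_example_key() is a hypothetical caller used purely for illustration; HKDF_CONTEXT_PER_FILE_ENC_KEY is one of the context bytes listed in fscrypt_private.h.

#include "fscrypt_private.h"

static void derive_example_key(const u8 *master_key, unsigned int master_key_size,
			       const u8 *nonce, unsigned int nonce_len,
			       u8 *out_key, unsigned int out_len)
{
	struct hmac_sha512_key hkdf;

	/* HKDF-Extract once; the prepared key can serve many expansions. */
	fscrypt_init_hkdf(&hkdf, master_key, master_key_size);

	/* HKDF-Expand with a context byte and application-specific info. */
	fscrypt_hkdf_expand(&hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY,
			    nonce, nonce_len, out_key, out_len);

	memzero_explicit(&hkdf, sizeof(hkdf));
}
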
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index e0b32ac841f7..b97de0d1430f 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -199,13 +199,13 @@ int fscrypt_prepare_setflags(struct inode *inode,
err = fscrypt_require_key(inode);
if (err)
return err;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
return -EINVAL;
mk = ci->ci_master_key;
down_read(&mk->mk_sem);
if (mk->mk_present)
- err = fscrypt_derive_dirhash_key(ci, mk);
+ fscrypt_derive_dirhash_key(ci, mk);
else
err = -ENOKEY;
up_read(&mk->mk_sem);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index caaff809765b..5dee7c498bc8 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -263,7 +263,7 @@ int fscrypt_derive_sw_secret(struct super_block *sb,
bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
- return inode->i_crypt_info->ci_inlinecrypt;
+ return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt;
}
EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
@@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
if (!fscrypt_inode_uses_inline_crypto(inode))
return;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
fscrypt_generate_dun(ci, first_lblk, dun);
bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask);
@@ -385,22 +385,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
u64 next_lblk)
{
const struct bio_crypt_ctx *bc = bio->bi_crypt_context;
+ const struct fscrypt_inode_info *ci;
u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
if (!!bc != fscrypt_inode_uses_inline_crypto(inode))
return false;
if (!bc)
return true;
+ ci = fscrypt_get_inode_info_raw(inode);
/*
* Comparing the key pointers is good enough, as all I/O for each key
* uses the same pointer. I.e., there's currently no need to support
* merging requests where the keys are the same but the pointers differ.
*/
- if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key)
+ if (bc->bc_key != ci->ci_enc_key.blk_key)
return false;
- fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun);
+ fscrypt_generate_dun(ci, next_lblk, next_dun);
return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun);
}
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio);
@@ -502,7 +504,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
if (nr_blocks <= 1)
return nr_blocks;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
if (!(fscrypt_policy_flags(&ci->ci_policy) &
FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
return nr_blocks;
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 7557f6a88b8f..3adbd7167055 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -42,7 +42,6 @@ struct fscrypt_keyring {
static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
{
- fscrypt_destroy_hkdf(&secret->hkdf);
memzero_explicit(secret, sizeof(*secret));
}
@@ -587,21 +586,17 @@ static int add_master_key(struct super_block *sb,
keyid_kdf_ctx =
HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY;
}
- err = fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size);
+ fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size);
/*
* Now that the KDF context is initialized, the raw KDF key is
* no longer needed.
*/
memzero_explicit(kdf_key, kdf_key_size);
- if (err)
- return err;
/* Calculate the key identifier */
- err = fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0,
- key_spec->u.identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
- if (err)
- return err;
+ fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0,
+ key_spec->u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
}
return do_add_master_key(sb, secret, key_spec);
}
@@ -835,24 +830,17 @@ fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret)
memcpy(secret->bytes, test_key, sizeof(test_key));
}
-int fscrypt_get_test_dummy_key_identifier(
+void fscrypt_get_test_dummy_key_identifier(
u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
struct fscrypt_master_key_secret secret;
- int err;
fscrypt_get_test_dummy_secret(&secret);
-
- err = fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size);
- if (err)
- goto out;
- err = fscrypt_hkdf_expand(&secret.hkdf,
- HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY,
- NULL, 0, key_identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
-out:
+ fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size);
+ fscrypt_hkdf_expand(&secret.hkdf,
+ HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, NULL, 0,
+ key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE);
wipe_master_key_secret(&secret);
- return err;
}
/**
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 4f3b9ecbfe4e..4bd3918f50e3 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -253,11 +253,8 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
sizeof(sb->s_uuid));
hkdf_infolen += sizeof(sb->s_uuid);
}
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
- hkdf_context, hkdf_info, hkdf_infolen,
- mode_key, mode->keysize);
- if (err)
- goto out_unlock;
+ fscrypt_hkdf_expand(&mk->mk_secret.hkdf, hkdf_context, hkdf_info,
+ hkdf_infolen, mode_key, mode->keysize);
err = fscrypt_prepare_key(prep_key, mode_key, ci);
memzero_explicit(mode_key, mode->keysize);
if (err)
@@ -278,36 +275,25 @@ out_unlock:
* as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an
* endianness swap in order to get the same results as on little endian CPUs.
*/
-static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
- u8 context, const u8 *info,
- unsigned int infolen, siphash_key_t *key)
+static void fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
+ u8 context, const u8 *info,
+ unsigned int infolen, siphash_key_t *key)
{
- int err;
-
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen,
- (u8 *)key, sizeof(*key));
- if (err)
- return err;
-
+ fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen,
+ (u8 *)key, sizeof(*key));
BUILD_BUG_ON(sizeof(*key) != 16);
BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2);
le64_to_cpus(&key->key[0]);
le64_to_cpus(&key->key[1]);
- return 0;
}
-int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
- const struct fscrypt_master_key *mk)
+void fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
+ const struct fscrypt_master_key *mk)
{
- int err;
-
- err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY,
- ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
- &ci->ci_dirhash_key);
- if (err)
- return err;
+ fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
+ &ci->ci_dirhash_key);
ci->ci_dirhash_key_initialized = true;
- return 0;
}
void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
@@ -338,17 +324,12 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci,
if (mk->mk_ino_hash_key_initialized)
goto unlock;
- err = fscrypt_derive_siphash_key(mk,
- HKDF_CONTEXT_INODE_HASH_KEY,
- NULL, 0, &mk->mk_ino_hash_key);
- if (err)
- goto unlock;
+ fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_INODE_HASH_KEY,
+ NULL, 0, &mk->mk_ino_hash_key);
/* pairs with smp_load_acquire() above */
smp_store_release(&mk->mk_ino_hash_key_initialized, true);
unlock:
mutex_unlock(&fscrypt_mode_key_setup_mutex);
- if (err)
- return err;
}
/*
@@ -402,13 +383,10 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
} else {
u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE];
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
- HKDF_CONTEXT_PER_FILE_ENC_KEY,
- ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
- derived_key, ci->ci_mode->keysize);
- if (err)
- return err;
-
+ fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
+ HKDF_CONTEXT_PER_FILE_ENC_KEY,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
+ derived_key, ci->ci_mode->keysize);
err = fscrypt_set_per_file_enc_key(ci, derived_key);
memzero_explicit(derived_key, ci->ci_mode->keysize);
}
@@ -416,11 +394,8 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
return err;
/* Derive a secret dirhash key for directories that need it. */
- if (need_dirhash_key) {
- err = fscrypt_derive_dirhash_key(ci, mk);
- if (err)
- return err;
- }
+ if (need_dirhash_key)
+ fscrypt_derive_dirhash_key(ci, mk);
return 0;
}
@@ -642,15 +617,16 @@ fscrypt_setup_encryption_info(struct inode *inode,
goto out;
/*
- * For existing inodes, multiple tasks may race to set ->i_crypt_info.
- * So use cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with
- * a RELEASE barrier so that other tasks can ACQUIRE it.
+ * For existing inodes, multiple tasks may race to set the inode's
+ * fscrypt info pointer. So use cmpxchg_release(). This pairs with the
+ * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the
+ * pointer with a RELEASE barrier so that other tasks can ACQUIRE it.
*/
- if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
+ if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) ==
+ NULL) {
/*
- * We won the race and set ->i_crypt_info to our crypt_info.
- * Now link it into the master key's inode list.
+ * We won the race and set the inode's fscrypt info to our
+ * crypt_info. Now link it into the master key's inode list.
*/
if (mk) {
crypt_info->ci_master_key = mk;
@@ -681,13 +657,13 @@ out:
* %false unless the operation being performed is needed in
* order for files (or directories) to be deleted.
*
- * Set up ->i_crypt_info, if it hasn't already been done.
+ * Set up the inode's encryption key, if it hasn't already been done.
*
- * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So
+ * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So
* generally this shouldn't be called from within a filesystem transaction.
*
- * Return: 0 if ->i_crypt_info was set or was already set, *or* if the
- * encryption key is unavailable. (Use fscrypt_has_encryption_key() to
+ * Return: 0 if the key is now set up, *or* if it couldn't be set up because the
+ * needed master key is absent. (Use fscrypt_has_encryption_key() to
* distinguish these cases.) Also can return another -errno code.
*/
int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
@@ -741,9 +717,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
* ->i_ino doesn't need to be set yet.
* @encrypt_ret: (output) set to %true if the new inode will be encrypted
*
- * If the directory is encrypted, set up its ->i_crypt_info in preparation for
+ * If the directory is encrypted, set up its encryption key in preparation for
* encrypting the name of the new file. Also, if the new inode will be
- * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true.
+ * encrypted, set up its encryption key too and set *encrypt_ret=true.
*
* This isn't %GFP_NOFS-safe, and therefore it should be called before starting
* any filesystem transaction to create the inode. For this reason, ->i_ino
@@ -752,8 +728,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
* This doesn't persist the new inode's encryption context. That still needs to
* be done later by calling fscrypt_set_context().
*
- * Return: 0 on success, -ENOKEY if the encryption key is missing, or another
- * -errno code
+ * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode
+ * but the needed master key is absent, or another -errno code
*/
int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
bool *encrypt_ret)
@@ -800,8 +776,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
*/
void fscrypt_put_encryption_info(struct inode *inode)
{
- put_crypt_info(inode->i_crypt_info);
- inode->i_crypt_info = NULL;
+ /*
+ * Ideally we'd start with a lightweight IS_ENCRYPTED() check here
+ * before proceeding to retrieve and check the pointer. However, during
+ * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If
+ * an error occurs, it needs to be cleaned up regardless.
+ */
+ struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode);
+
+ put_crypt_info(*ci_addr);
+ *ci_addr = NULL;
}
EXPORT_SYMBOL(fscrypt_put_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 6ad30ae07c06..bbb2f5ced988 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -727,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
err = fscrypt_require_key(dir);
if (err)
return ERR_PTR(err);
- return &dir->i_crypt_info->ci_policy;
+ return &fscrypt_get_inode_info_raw(dir)->ci_policy;
}
return fscrypt_get_dummy_policy(dir->i_sb);
@@ -746,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
*/
int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
{
- struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
BUILD_BUG_ON(sizeof(union fscrypt_context) !=
FSCRYPT_SET_CONTEXT_MAX_SIZE);
@@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
*/
int fscrypt_set_context(struct inode *inode, void *fs_data)
{
- struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci;
union fscrypt_context ctx;
int ctxsize;
@@ -783,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
* This may be the first time the inode number is available, so do any
* delayed key setup that requires the inode number.
*/
+ ci = fscrypt_get_inode_info_raw(inode);
if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
(ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
fscrypt_hash_inode_number(ci, ci->ci_master_key);
@@ -826,10 +827,8 @@ int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
policy->version = FSCRYPT_POLICY_V2;
policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- err = fscrypt_get_test_dummy_key_identifier(
+ fscrypt_get_test_dummy_key_identifier(
policy->v2.master_key_identifier);
- if (err)
- goto out;
} else {
err = -EINVAL;
goto out;
diff --git a/fs/dcache.c b/fs/dcache.c
index 60046ae23d51..65cc11939654 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2509,8 +2509,8 @@ static inline unsigned start_dir_add(struct inode *dir)
{
preempt_disable_nested();
for (;;) {
- unsigned n = dir->i_dir_seq;
- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+ unsigned n = READ_ONCE(dir->i_dir_seq);
+ if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1))
return n;
cpu_relax();
}
@@ -2922,6 +2922,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
write_sequnlock(&rename_lock);
}
+EXPORT_SYMBOL(d_exchange);
/**
* d_ancestor - search for an ancestor
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c12d649df6a5..661a99a7dfbe 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -362,7 +362,8 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
}
EXPORT_SYMBOL_GPL(debugfs_lookup);
-static struct dentry *start_creating(const char *name, struct dentry *parent)
+static struct dentry *debugfs_start_creating(const char *name,
+ struct dentry *parent)
{
struct dentry *dentry;
int error;
@@ -428,7 +429,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
if (!(mode & S_IFMT))
mode |= S_IFREG;
BUG_ON(!S_ISREG(mode));
- dentry = start_creating(name, parent);
+ dentry = debugfs_start_creating(name, parent);
if (IS_ERR(dentry))
return dentry;
@@ -577,7 +578,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size);
*/
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
{
- struct dentry *dentry = start_creating(name, parent);
+ struct dentry *dentry = debugfs_start_creating(name, parent);
struct inode *inode;
if (IS_ERR(dentry))
@@ -624,7 +625,7 @@ struct dentry *debugfs_create_automount(const char *name,
debugfs_automount_t f,
void *data)
{
- struct dentry *dentry = start_creating(name, parent);
+ struct dentry *dentry = debugfs_start_creating(name, parent);
struct inode *inode;
if (IS_ERR(dentry))
@@ -687,7 +688,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
if (!link)
return ERR_PTR(-ENOMEM);
- dentry = start_creating(name, parent);
+ dentry = debugfs_start_creating(name, parent);
if (IS_ERR(dentry)) {
kfree(link);
return dentry;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index a23fd524a6ee..a0d75b5c83c6 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -26,6 +26,7 @@
/*
* /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid (refers to <node>)
* /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/release_recover
* /config/dlm/<cluster>/comms/<comm>/nodeid (refers to <comm>)
* /config/dlm/<cluster>/comms/<comm>/local
* /config/dlm/<cluster>/comms/<comm>/addr (write only)
@@ -267,6 +268,7 @@ enum {
enum {
NODE_ATTR_NODEID = 0,
NODE_ATTR_WEIGHT,
+ NODE_ATTR_RELEASE_RECOVER,
};
struct dlm_clusters {
@@ -280,6 +282,8 @@ struct dlm_spaces {
struct dlm_space {
struct config_group group;
struct list_head members;
+ struct list_head members_gone;
+ int members_gone_count;
struct mutex members_lock;
int members_count;
struct dlm_nodes *nds;
@@ -310,6 +314,14 @@ struct dlm_node {
int weight;
int new;
int comm_seq; /* copy of cm->seq when nd->nodeid is set */
+ unsigned int release_recover;
+};
+
+struct dlm_member_gone {
+ int nodeid;
+ unsigned int release_recover;
+
+ struct list_head list; /* space->members_gone */
};
static struct configfs_group_operations clusters_ops = {
@@ -480,6 +492,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
configfs_add_default_group(&nds->ns_group, &sp->group);
INIT_LIST_HEAD(&sp->members);
+ INIT_LIST_HEAD(&sp->members_gone);
mutex_init(&sp->members_lock);
sp->members_count = 0;
sp->nds = nds;
@@ -587,10 +600,20 @@ static void drop_node(struct config_group *g, struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
struct dlm_node *nd = config_item_to_node(i);
+ struct dlm_member_gone *mb_gone;
+
+ mb_gone = kzalloc(sizeof(*mb_gone), GFP_KERNEL);
+ if (!mb_gone)
+ return;
mutex_lock(&sp->members_lock);
list_del(&nd->list);
sp->members_count--;
+
+ mb_gone->nodeid = nd->nodeid;
+ mb_gone->release_recover = nd->release_recover;
+ list_add(&mb_gone->list, &sp->members_gone);
+ sp->members_gone_count++;
mutex_unlock(&sp->members_lock);
config_item_put(i);
@@ -815,12 +838,34 @@ static ssize_t node_weight_store(struct config_item *item, const char *buf,
return len;
}
+static ssize_t node_release_recover_show(struct config_item *item, char *buf)
+{
+ struct dlm_node *n = config_item_to_node(item);
+
+ return sprintf(buf, "%u\n", n->release_recover);
+}
+
+static ssize_t node_release_recover_store(struct config_item *item,
+ const char *buf, size_t len)
+{
+ struct dlm_node *n = config_item_to_node(item);
+ int rc;
+
+ rc = kstrtouint(buf, 0, &n->release_recover);
+ if (rc)
+ return rc;
+
+ return len;
+}
+
CONFIGFS_ATTR(node_, nodeid);
CONFIGFS_ATTR(node_, weight);
+CONFIGFS_ATTR(node_, release_recover);
static struct configfs_attribute *node_attrs[] = {
[NODE_ATTR_NODEID] = &node_attr_nodeid,
[NODE_ATTR_WEIGHT] = &node_attr_weight,
+ [NODE_ATTR_RELEASE_RECOVER] = &node_attr_release_recover,
NULL,
};
@@ -882,9 +927,10 @@ static void put_comm(struct dlm_comm *cm)
int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
int *count_out)
{
+ struct dlm_member_gone *mb_gone, *mb_safe;
+ struct dlm_config_node *nodes, *node;
struct dlm_space *sp;
struct dlm_node *nd;
- struct dlm_config_node *nodes, *node;
int rv, count;
sp = get_space(lsname);
@@ -898,7 +944,7 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
goto out;
}
- count = sp->members_count;
+ count = sp->members_count + sp->members_gone_count;
nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS);
if (!nodes) {
@@ -917,6 +963,20 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
nd->new = 0;
}
+ /* we delay the removal of nodes until here as configfs does
+ * not support additional attributes for rmdir().
+ */
+ list_for_each_entry_safe(mb_gone, mb_safe, &sp->members_gone, list) {
+ node->nodeid = mb_gone->nodeid;
+ node->release_recover = mb_gone->release_recover;
+ node->gone = true;
+ node++;
+
+ list_del(&mb_gone->list);
+ sp->members_gone_count--;
+ kfree(mb_gone);
+ }
+
*count_out = count;
*nodes_out = nodes;
rv = 0;
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 13a3d0b26194..4ebd45f75276 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -17,8 +17,10 @@
struct dlm_config_node {
int nodeid;
int weight;
+ bool gone;
int new;
uint32_t comm_seq;
+ unsigned int release_recover;
};
extern const struct rhashtable_params dlm_rhash_rsb_params;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 6dd3a524cd35..be938fdf17d9 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5576,7 +5576,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (rl->rl_status == DLM_LKSTS_CONVERT && middle_conversion(lkb)) {
/* We may need to adjust grmode depending on other granted locks. */
- log_limit(ls, "%s %x middle convert gr %d rq %d remote %d %x",
+ log_rinfo(ls, "%s %x middle convert gr %d rq %d remote %d %x",
__func__, lkb->lkb_id, lkb->lkb_grmode,
lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid);
rsb_set_flag(r, RSB_RECOVER_CONVERT);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 1929327ffbe1..ddaa76558706 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -186,12 +186,17 @@ static struct kobj_type dlm_ktype = {
static struct kset *dlm_kset;
-static int do_uevent(struct dlm_ls *ls, int in)
+static int do_uevent(struct dlm_ls *ls, int in, unsigned int release_recover)
{
- if (in)
+ char message[512] = {};
+ char *envp[] = { message, NULL };
+
+ if (in) {
kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
- else
- kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
+ } else {
+ snprintf(message, 511, "RELEASE_RECOVER=%u", release_recover);
+ kobject_uevent_env(&ls->ls_kobj, KOBJ_OFFLINE, envp);
+ }
log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving");
@@ -575,7 +580,7 @@ static int new_lockspace(const char *name, const char *cluster,
current lockspace members are (via configfs) and then tells the
lockspace to start running (via sysfs) in dlm_ls_start(). */
- error = do_uevent(ls, 1);
+ error = do_uevent(ls, 1, 0);
if (error < 0)
goto out_recoverd;
@@ -592,7 +597,7 @@ static int new_lockspace(const char *name, const char *cluster,
return 0;
out_members:
- do_uevent(ls, 0);
+ do_uevent(ls, 0, 0);
dlm_clear_members(ls);
kfree(ls->ls_node_array);
out_recoverd:
@@ -671,19 +676,20 @@ int dlm_new_user_lockspace(const char *name, const char *cluster,
This is because there may be LKBs queued as ASTs that have been unlinked
from their RSBs and are pending deletion once the AST has been delivered */
-static int lockspace_busy(struct dlm_ls *ls, int force)
+static int lockspace_busy(struct dlm_ls *ls, unsigned int release_option)
{
struct dlm_lkb *lkb;
unsigned long id;
int rv = 0;
read_lock_bh(&ls->ls_lkbxa_lock);
- if (force == 0) {
+ if (release_option == DLM_RELEASE_NO_LOCKS) {
xa_for_each(&ls->ls_lkbxa, id, lkb) {
rv = 1;
break;
}
- } else if (force == 1) {
+ } else if (release_option == DLM_RELEASE_UNUSED) {
+ /* TODO: handle this UNUSED option as NO_LOCKS in a later patch */
xa_for_each(&ls->ls_lkbxa, id, lkb) {
if (lkb->lkb_nodeid == 0 &&
lkb->lkb_grmode != DLM_LOCK_IV) {
@@ -698,11 +704,11 @@ static int lockspace_busy(struct dlm_ls *ls, int force)
return rv;
}
-static int release_lockspace(struct dlm_ls *ls, int force)
+static int release_lockspace(struct dlm_ls *ls, unsigned int release_option)
{
int busy, rv;
- busy = lockspace_busy(ls, force);
+ busy = lockspace_busy(ls, release_option);
spin_lock_bh(&lslist_lock);
if (ls->ls_create_count == 1) {
@@ -730,8 +736,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
dlm_device_deregister(ls);
- if (force < 3 && dlm_user_daemon_available())
- do_uevent(ls, 0);
+ if (release_option != DLM_RELEASE_NO_EVENT &&
+ dlm_user_daemon_available())
+ do_uevent(ls, 0, (release_option == DLM_RELEASE_RECOVER));
dlm_recoverd_stop(ls);
@@ -782,25 +789,24 @@ static int release_lockspace(struct dlm_ls *ls, int force)
* lockspace must continue to function as usual, participating in recoveries,
* until this returns.
*
- * Force has 4 possible values:
- * 0 - don't destroy lockspace if it has any LKBs
- * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
- * 2 - destroy lockspace regardless of LKBs
- * 3 - destroy lockspace as part of a forced shutdown
+ * See DLM_RELEASE defines for release_option values and their meaning.
*/
-int dlm_release_lockspace(void *lockspace, int force)
+int dlm_release_lockspace(void *lockspace, unsigned int release_option)
{
struct dlm_ls *ls;
int error;
+ if (release_option > __DLM_RELEASE_MAX)
+ return -EINVAL;
+
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
return -EINVAL;
dlm_put_lockspace(ls);
mutex_lock(&ls_lock);
- error = release_lockspace(ls, force);
+ error = release_lockspace(ls, release_option);
if (!error)
ls_count--;
if (!ls_count)
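
The numeric "force" argument is replaced by named DLM_RELEASE_* options above, but their definitions are not part of this hunk (they presumably live next to the dlm_release_lockspace() prototype in include/linux/dlm.h). The following is only a hedged sketch of one mapping consistent with the old 0-3 force values and the names used in release_lockspace(), not the authoritative header.

/* Hedged sketch; the real defines are not shown in this diff. */
#define DLM_RELEASE_NO_LOCKS	0	/* don't destroy a lockspace holding any LKBs */
#define DLM_RELEASE_UNUSED	1	/* destroy if only remote LKBs remain */
#define DLM_RELEASE_NORMAL	2	/* destroy regardless of LKBs */
#define DLM_RELEASE_NO_EVENT	3	/* forced shutdown, skip the offline uevent */
#define DLM_RELEASE_RECOVER	4	/* destroy and ask remaining nodes to recover */
#define __DLM_RELEASE_MAX	DLM_RELEASE_RECOVER
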
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e4373bce1bc2..9a0b6c2b6b01 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1703,7 +1703,7 @@ static int work_start(void)
return -ENOMEM;
}
- process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH, 0);
+ process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH | WQ_PERCPU, 0);
if (!process_workqueue) {
log_print("can't start dlm_process");
destroy_workqueue(io_workqueue);
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 4887c8a05318..a44d16da7187 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -52,7 +52,7 @@ static int __init init_dlm(void)
if (error)
goto out_user;
- dlm_wq = alloc_workqueue("dlm_wq", 0, 0);
+ dlm_wq = alloc_workqueue("dlm_wq", WQ_PERCPU, 0);
if (!dlm_wq) {
error = -ENOMEM;
goto out_plock;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b0864c93230f..c0f557a80a75 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -478,7 +478,8 @@ static void dlm_lsop_recover_prep(struct dlm_ls *ls)
ls->ls_ops->recover_prep(ls->ls_ops_arg);
}
-static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
+static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb,
+ unsigned int release_recover)
{
struct dlm_slot slot;
uint32_t seq;
@@ -495,7 +496,7 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
error = dlm_comm_seq(memb->nodeid, &seq, false);
- if (!error && seq == memb->comm_seq)
+ if (!release_recover && !error && seq == memb->comm_seq)
return;
slot.nodeid = memb->nodeid;
@@ -552,6 +553,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
struct dlm_member *memb, *safe;
struct dlm_config_node *node;
int i, error, neg = 0, low = -1;
+ unsigned int release_recover;
/* previously removed members that we've not finished removing need to
* count as a negative change so the "neg" recovery steps will happen
@@ -569,11 +571,21 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
node = find_config_node(rv, memb->nodeid);
- if (node && !node->new)
+ if (!node) {
+ log_error(ls, "remove member %d invalid",
+ memb->nodeid);
+ return -EFAULT;
+ }
+
+ if (!node->new && !node->gone)
continue;
- if (!node) {
- log_rinfo(ls, "remove member %d", memb->nodeid);
+ release_recover = 0;
+
+ if (node->gone) {
+ release_recover = node->release_recover;
+ log_rinfo(ls, "remove member %d%s", memb->nodeid,
+ release_recover ? " (release_recover)" : "");
} else {
/* removed and re-added */
log_rinfo(ls, "remove member %d comm_seq %u %u",
@@ -584,13 +596,16 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
list_move(&memb->list, &ls->ls_nodes_gone);
remove_remote_member(memb->nodeid);
ls->ls_num_nodes--;
- dlm_lsop_recover_slot(ls, memb);
+ dlm_lsop_recover_slot(ls, memb, release_recover);
}
/* add new members to ls_nodes */
for (i = 0; i < rv->nodes_count; i++) {
node = &rv->nodes[i];
+ if (node->gone)
+ continue;
+
if (dlm_is_member(ls, node->nodeid))
continue;
error = dlm_add_member(ls, node);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index be4240f09abd..3ac020fb8139 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -842,7 +842,7 @@ static void recover_conversion(struct dlm_rsb *r)
*/
if (((lkb->lkb_grmode == DLM_LOCK_PR) && (other_grmode == DLM_LOCK_CW)) ||
((lkb->lkb_grmode == DLM_LOCK_CW) && (other_grmode == DLM_LOCK_PR))) {
- log_limit(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL",
+ log_rinfo(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL",
__func__, lkb->lkb_id, lkb->lkb_grmode,
lkb->lkb_rqmode, lkb->lkb_nodeid,
lkb->lkb_remid, other_lkid, other_grmode);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 5cb3896be826..51daf4acbe31 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -425,7 +425,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
dlm_put_lockspace(ls);
if (error)
- dlm_release_lockspace(lockspace, 0);
+ dlm_release_lockspace(lockspace, DLM_RELEASE_NO_LOCKS);
else
error = ls->ls_device.minor;
@@ -436,7 +436,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
{
dlm_lockspace_t *lockspace;
struct dlm_ls *ls;
- int error, force = 0;
+ int error, force = DLM_RELEASE_NO_LOCKS;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -446,7 +446,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
return -ENOENT;
if (params->flags & DLM_USER_LSFLG_FORCEFREE)
- force = 2;
+ force = DLM_RELEASE_NORMAL;
lockspace = ls;
dlm_put_lockspace(ls);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 72fbe1316ab8..abd954c6a14e 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -634,10 +634,9 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto out_lock;
}
- rd.old_mnt_idmap = &nop_mnt_idmap;
+ rd.mnt_idmap = &nop_mnt_idmap;
rd.old_parent = lower_old_dir_dentry;
rd.old_dentry = lower_old_dentry;
- rd.new_mnt_idmap = &nop_mnt_idmap;
rd.new_parent = lower_new_dir_dentry;
rd.new_dentry = lower_new_dentry;
rc = vfs_rename(&rd);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 4bb4002e3cdf..1f4d8ce56667 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb);
static const struct super_operations efivarfs_ops = {
.statfs = efivarfs_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.alloc_inode = efivarfs_alloc_inode,
.free_inode = efivarfs_free_inode,
.show_options = efivarfs_show_options,
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 3b1ba571c728..8ca29962a3dd 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -475,6 +475,10 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations erofs_file_fops = {
.llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
+ .unlocked_ioctl = erofs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = erofs_compat_ioctl,
+#endif
.mmap_prepare = erofs_file_mmap_prepare,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index debf469ad6bd..32b4f5aa60c9 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -123,4 +123,8 @@ const struct file_operations erofs_dir_fops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate_shared = erofs_readdir,
+ .unlocked_ioctl = erofs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = erofs_compat_ioctl,
+#endif
};
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 377ee12b8b96..3d5738f80072 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -12,10 +12,12 @@
/* to allow for x86 boot sectors and other oddities. */
#define EROFS_SUPER_OFFSET 1024
-#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
-#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
-#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008
+#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010
+
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 9a2f59721522..cb780c095d28 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
+#include <linux/compat.h>
#include <trace/events/erofs.h>
static int erofs_fill_symlink(struct inode *inode, void *kaddr,
@@ -213,10 +214,7 @@ static int erofs_fill_inode(struct inode *inode)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
- if (erofs_inode_is_data_compressed(vi->datalayout))
- inode->i_fop = &generic_ro_fops;
- else
- inode->i_fop = &erofs_file_fops;
+ inode->i_fop = &erofs_file_fops;
break;
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
@@ -341,6 +339,40 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
return 0;
}
+static int erofs_ioctl_get_volume_label(struct inode *inode, void __user *arg)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ int ret;
+
+ if (!sbi->volume_name)
+ ret = clear_user(arg, 1);
+ else
+ ret = copy_to_user(arg, sbi->volume_name,
+ strlen(sbi->volume_name));
+ return ret ? -EFAULT : 0;
+}
+
+long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case FS_IOC_GETFSLABEL:
+ return erofs_ioctl_get_volume_label(inode, argp);
+ default:
+ return -ENOTTY;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ return erofs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
const struct inode_operations erofs_generic_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
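
The new erofs_ioctl() handler above only implements FS_IOC_GETFSLABEL. A hedged userspace sketch of reading the label through it, assuming the standard FS_IOC_GETFSLABEL and FSLABEL_MAX from <linux/fs.h>; the "/mnt/erofs" mount point is just an example path.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	char label[FSLABEL_MAX] = "";	/* zero-filled so the result is NUL-terminated */
	int fd = open("/mnt/erofs", O_RDONLY);

	if (fd < 0 || ioctl(fd, FS_IOC_GETFSLABEL, label) != 0) {
		perror("FS_IOC_GETFSLABEL");
		return 1;
	}
	printf("volume label: %s\n", label);
	close(fd);
	return 0;
}
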
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4ccc5f0ee8df..f7f622836198 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -153,6 +153,7 @@ struct erofs_sb_info {
/* used for statfs, f_files - f_favail */
u64 inos;
+ char *volume_name;
u32 feature_compat;
u32 feature_incompat;
@@ -234,6 +235,7 @@ EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX)
+EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX)
static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
{
@@ -535,6 +537,10 @@ static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) {
static inline void erofs_fscache_submit_bio(struct bio *bio) {}
#endif
+long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg);
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 1b529ace4db0..f3f8d8c066e4 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -343,6 +343,13 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec);
super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
+ if (dsb->volume_name[0]) {
+ sbi->volume_name = kstrndup(dsb->volume_name,
+ sizeof(dsb->volume_name), GFP_KERNEL);
+ if (!sbi->volume_name)
+ return -ENOMEM;
+ }
+
/* parse on-disk compression configurations */
ret = z_erofs_parse_cfgs(sb, dsb);
if (ret < 0)
@@ -822,6 +829,7 @@ static void erofs_sb_free(struct erofs_sb_info *sbi)
kfree(sbi->domain_id);
if (sbi->dif0.file)
fput(sbi->dif0.file);
+ kfree(sbi->volume_name);
kfree(sbi);
}
@@ -1018,10 +1026,22 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static void erofs_evict_inode(struct inode *inode)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ dax_break_layout_final(inode);
+#endif
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
const struct super_operations erofs_sops = {
.put_super = erofs_put_super,
.alloc_inode = erofs_alloc_inode,
.free_inode = erofs_free_inode,
+ .evict_inode = erofs_evict_inode,
.statfs = erofs_statfs,
.show_options = erofs_show_options,
};
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index eaa9efd766ee..396536d9a862 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -482,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2;
struct erofs_xattr_prefix_item *pfs;
int ret = 0, i, len;
+ bool plain = erofs_sb_has_plain_xattr_pfx(sbi);
if (!sbi->xattr_prefix_count)
return 0;
@@ -490,9 +491,15 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
if (!pfs)
return -ENOMEM;
- if (sbi->packed_inode)
- buf.mapping = sbi->packed_inode->i_mapping;
- else
+ if (!plain) {
+ if (erofs_sb_has_metabox(sbi))
+ (void)erofs_init_metabuf(&buf, sb, true);
+ else if (sbi->packed_inode)
+ buf.mapping = sbi->packed_inode->i_mapping;
+ else
+ plain = true;
+ }
+ if (plain)
(void)erofs_init_metabuf(&buf, sb, false);
for (i = 0; i < sbi->xattr_prefix_count; i++) {
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 2d73297003d2..bc80cfe482f7 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -823,9 +823,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
}
rcu_read_unlock();
}
- } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
}
if (pcl) {
@@ -1835,7 +1832,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
map->m_la = end;
err = z_erofs_map_blocks_iter(inode, map,
EROFS_GET_BLOCKS_READMORE);
- if (err)
+ if (err || !(map->m_flags & EROFS_MAP_ENCODED))
return;
/* expand ra for the trailing edge if readahead */
@@ -1847,7 +1844,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
end = round_up(end, PAGE_SIZE);
} else {
end = round_up(map->m_la, PAGE_SIZE);
- if (!map->m_llen)
+ if (!(map->m_flags & EROFS_MAP_ENCODED) || !map->m_llen)
return;
}
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index a93efd95c555..e5581dbeb4c2 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -394,10 +394,10 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
.map = map,
.in_mbox = erofs_inode_in_metabox(inode),
};
- int err = 0;
- unsigned int endoff, afmt;
+ unsigned int endoff;
unsigned long initial_lcn;
unsigned long long ofs, end;
+ int err;
ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) &&
@@ -462,8 +462,8 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
map->m_pa = vi->z_fragmentoff;
map->m_plen = vi->z_idata_size;
if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
- erofs_err(sb, "invalid tail-packing pclustersize %llu",
- map->m_plen);
+ erofs_err(sb, "ztailpacking inline data across blocks @ nid %llu",
+ vi->nid);
err = -EFSCORRUPTED;
goto unmap_out;
}
@@ -482,20 +482,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
err = -EFSCORRUPTED;
goto unmap_out;
}
- afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
- Z_EROFS_COMPRESSION_INTERLACED :
- Z_EROFS_COMPRESSION_SHIFTED;
+ if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+ map->m_algorithmformat = vi->z_algorithmtype[1];
} else {
- afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
- vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
- if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
- erofs_err(sb, "inconsistent algorithmtype %u for nid %llu",
- afmt, vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
- }
+ map->m_algorithmformat = vi->z_algorithmtype[0];
}
- map->m_algorithmformat = afmt;
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
@@ -626,9 +621,9 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err, headnr;
- erofs_off_t pos;
struct z_erofs_map_header *h;
+ erofs_off_t pos;
+ int err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
/*
@@ -642,7 +637,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
- err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
goto out_unlock;
@@ -679,15 +673,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- headnr = 0;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
- vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
- headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
- err = -EOPNOTSUPP;
- goto out_unlock;
- }
-
if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
@@ -726,6 +711,30 @@ out_unlock:
return err;
}
+static int z_erofs_map_sanity_check(struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+
+ if (!(map->m_flags & EROFS_MAP_ENCODED))
+ return 0;
+ if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) {
+ erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel",
+ map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid);
+ return -EOPNOTSUPP;
+ }
+ if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX &&
+ !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ map->m_algorithmformat, EROFS_I(inode)->nid);
+ return -EFSCORRUPTED;
+ }
+ if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
+ map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags)
{
@@ -746,10 +755,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
else
err = z_erofs_map_blocks_fo(inode, map, flags);
}
- if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
- unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
- map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
- err = -EOPNOTSUPP;
+ if (!err)
+ err = z_erofs_map_sanity_check(inode, map);
if (err)
map->m_llen = 0;
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b22d6f819f78..ee7c4b683ec3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -46,10 +46,10 @@
*
* 1) epnested_mutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->lock (rwlock)
+ * 3) ep->lock (spinlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a rwlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -195,7 +195,7 @@ struct eventpoll {
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
- rwlock_t lock;
+ spinlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
@@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
* in a lockless way.
*/
lockdep_assert_irqs_enabled();
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, txlist);
WRITE_ONCE(ep->ovflist, NULL);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_done_scan(struct eventpoll *ep,
@@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep,
{
struct epitem *epi, *nepi;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep,
wake_up(&ep->wq);
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_get(struct eventpoll *ep)
@@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
rb_erase_cached(&epi->rbn, &ep->rbr);
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep)
return -ENOMEM;
mutex_init(&ep->mtx);
- rwlock_init(&ep->lock);
+ spin_lock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1240,99 +1240,9 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
#endif /* CONFIG_KCMP */
/*
- * Adds a new entry to the tail of the list in a lockless way, i.e.
- * multiple CPUs are allowed to call this function concurrently.
- *
- * Beware: it is necessary to prevent any other modifications of the
- * existing list until all changes are completed, in other words
- * concurrent list_add_tail_lockless() calls should be protected
- * with a read lock, where write lock acts as a barrier which
- * makes sure all list_add_tail_lockless() calls are fully
- * completed.
- *
- * Also an element can be locklessly added to the list only in one
- * direction i.e. either to the tail or to the head, otherwise
- * concurrent access will corrupt the list.
- *
- * Return: %false if element has been already added to the list, %true
- * otherwise.
- */
-static inline bool list_add_tail_lockless(struct list_head *new,
- struct list_head *head)
-{
- struct list_head *prev;
-
- /*
- * This is simple 'new->next = head' operation, but cmpxchg()
- * is used in order to detect that same element has been just
- * added to the list from another CPU: the winner observes
- * new->next == new.
- */
- if (!try_cmpxchg(&new->next, &new, head))
- return false;
-
- /*
- * Initially ->next of a new element must be updated with the head
- * (we are inserting to the tail) and only then pointers are atomically
- * exchanged. XCHG guarantees memory ordering, thus ->next should be
- * updated before pointers are actually swapped and pointers are
- * swapped before prev->next is updated.
- */
-
- prev = xchg(&head->prev, new);
-
- /*
- * It is safe to modify prev->next and new->prev, because a new element
- * is added only to the tail and new->next is updated before XCHG.
- */
-
- prev->next = new;
- new->prev = prev;
-
- return true;
-}
-
-/*
- * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
- * i.e. multiple CPUs are allowed to call this function concurrently.
- *
- * Return: %false if epi element has been already chained, %true otherwise.
- */
-static inline bool chain_epi_lockless(struct epitem *epi)
-{
- struct eventpoll *ep = epi->ep;
-
- /* Fast preliminary check */
- if (epi->next != EP_UNACTIVE_PTR)
- return false;
-
- /* Check that the same epi has not been just chained from another CPU */
- if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
- return false;
-
- /* Atomically exchange tail */
- epi->next = xchg(&ep->ovflist, epi);
-
- return true;
-}
-
-/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
- *
- * This callback takes a read lock in order not to contend with concurrent
- * events from another file descriptor, thus all modifications to ->rdllist
- * or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_start/done_scan(), which stops all list modifications and guarantees
- * that lists state is seen correctly.
- *
- * Another thing worth to mention is that ep_poll_callback() can be called
- * concurrently for the same @epi from different CPUs if poll table was inited
- * with several wait queues entries. Plural wakeup from different CPUs of a
- * single wait queue is serialized by wq.lock, but the case when multiple wait
- * queues are used should be detected accordingly. This is detected using
- * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
@@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
unsigned long flags;
int ewake = 0;
- read_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (chain_epi_lockless(epi))
+ if (epi->next == EP_UNACTIVE_PTR) {
+ epi->next = READ_ONCE(ep->ovflist);
+ WRITE_ONCE(ep->ovflist, epi);
ep_pm_stay_awake_rcu(epi);
+ }
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
- if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
- ep_pm_stay_awake_rcu(epi);
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake_rcu(epi);
}
/*
@@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
pwake++;
out_unlock:
- read_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
}
/* We have to drop the new item inside our item list to keep track of it */
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
/* We have to call this outside the lock */
if (pwake)
@@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
init_wait(&wait);
wait.func = ep_autoremove_wake_function;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* Barrierless variant, waitqueue_active() is called under
* the same lock on wakeup ep_poll_callback() side, so it
@@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
if (!eavail)
timed_out = !ep_schedule_timeout(to) ||
@@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
eavail = 1;
if (!list_empty_careful(&wait.entry)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
@@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
}
}
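Editor's note on the eventpoll hunks above: once ep->lock is an ordinary spinlock again, every writer to ->rdllist and ->ovflist holds the lock, so the cmpxchg/xchg helpers being deleted are no longer needed and a plain list_add_tail() plus a simple singly linked push suffice. The fragment below is a minimal userspace sketch of that locked overflow-push pattern; the names (item, overflow_head, UNQUEUED, chain_item) are invented stand-ins for epitem, ep->ovflist, EP_UNACTIVE_PTR and the open-coded chaining in ep_poll_callback(), not kernel code.

    #include <pthread.h>
    #include <stddef.h>

    #define UNQUEUED ((struct item *)1)    /* sentinel: "not on the overflow list" */

    struct item {
        struct item *next;                 /* UNQUEUED while not chained */
    };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct item *overflow_head;     /* NULL-terminated singly linked list */

    /* Chain an item onto the overflow list. Holding the lock makes the test
     * and the two stores atomic as a unit, so no cmpxchg is needed to detect
     * a concurrent insert of the same item; the consumer resets ->next to
     * UNQUEUED when it unchains the item. */
    static void chain_item(struct item *it)
    {
        pthread_mutex_lock(&lock);
        if (it->next == UNQUEUED) {
            it->next = overflow_head;
            overflow_head = it;
        }
        pthread_mutex_unlock(&lock);
    }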
diff --git a/fs/exec.c b/fs/exec.c
index 2a1e5e4042a1..4a89918b761f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -599,7 +599,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
unsigned long stack_top,
int executable_stack)
{
- unsigned long ret;
+ int ret;
unsigned long stack_shift;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = bprm->vma;
@@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ
{
int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
+ if (!error && !write)
validate_coredump_safety();
return error;
}
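The setup_arg_pages() hunk above is worth a one-line illustration: kernel error codes are negative, so declaring the accumulator as unsigned long makes -ENOMEM wrap to a huge positive value and every "if (ret < 0)" check silently passes. A standalone userspace demonstration of the pitfall (values are hypothetical):

    #include <stdio.h>
    #include <errno.h>

    int main(void)
    {
        unsigned long bad = -ENOMEM;  /* wraps to 0xffff...fff4 */
        int good = -ENOMEM;

        /* Many compilers warn that this comparison is always false. */
        printf("bad  < 0: %d\n", bad < 0);   /* prints 0: failure goes unnoticed */
        printf("good < 0: %d\n", good < 0);  /* prints 1: failure is detected   */
        return 0;
    }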
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 0a056d97e640..cf0a0970c095 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -227,6 +227,8 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
}
const struct fscrypt_operations ext4_cryptops = {
+ .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) -
+ (int)offsetof(struct ext4_inode_info, vfs_inode),
.needs_bounce_pages = 1,
.has_32bit_inodes = 1,
.supports_subblock_data_units = 1,
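The new .inode_info_offs initializer above stores the signed byte distance from the embedded VFS inode to the filesystem's per-inode info pointer, which lets the generic fscrypt/fsverity code locate that pointer from a bare struct inode * without calling back into the filesystem. Below is a minimal userspace sketch of the same arithmetic; the structure and field names (my_inode_info, vfs_inode, i_crypt_info) are invented for illustration.

    #include <stddef.h>
    #include <stdio.h>

    struct vfs_inode { int dummy; };

    struct my_inode_info {
        long private_state;
        void *i_crypt_info;             /* per-inode info the generic layer wants */
        struct vfs_inode vfs_inode;     /* embedded "VFS inode" */
    };

    /* Signed offset from the embedded vfs_inode to the info pointer. */
    static const int inode_info_offs =
        (int)offsetof(struct my_inode_info, i_crypt_info) -
        (int)offsetof(struct my_inode_info, vfs_inode);

    /* Recover the info slot from a bare vfs_inode pointer. */
    static void **info_slot(struct vfs_inode *inode)
    {
        return (void **)((char *)inode + inode_info_offs);
    }

    int main(void)
    {
        struct my_inode_info fi = { .i_crypt_info = &fi };

        printf("%d\n", *info_slot(&fi.vfs_inode) == fi.i_crypt_info);  /* 1 */
        return 0;
    }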
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 01a6e2de7fc3..6cb784a56b3b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1182,6 +1182,14 @@ struct ext4_inode_info {
__u32 i_csum_seed;
kprojid_t i_projid;
+
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
+#endif
+
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info;
+#endif
};
/*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index df4051613b29..ba4fd9aba1c1 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
"nonexistent device\n", __func__, __LINE__);
return;
}
- if (atomic_read(&inode->i_count) > 1) {
+ if (icount_read(inode) > 1) {
ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
__func__, __LINE__, inode->i_ino,
- atomic_read(&inode->i_count));
+ icount_read(inode));
return;
}
if (inode->i_nlink) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5898d92ba19f..8b18802e83eb 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3995,7 +3995,7 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
list_splice_tail(&freed_data_list, &sbi->s_discard_list);
spin_unlock(&sbi->s_md_lock);
if (wake)
- queue_work(system_unbound_wq, &sbi->s_discard_work);
+ queue_work(system_dfl_wq, &sbi->s_discard_work);
} else {
list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
kmem_cache_free(ext4_free_data_cachep, entry);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 699c15db28a8..7f2d4014d128 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
static int ext4_drop_inode(struct inode *inode)
{
- int drop = generic_drop_inode(inode);
+ int drop = inode_generic_drop(inode);
if (!drop)
drop = fscrypt_drop_inode(inode);
@@ -1470,6 +1470,12 @@ static void init_once(void *foo)
init_rwsem(&ei->i_data_sem);
inode_init_once(&ei->vfs_inode);
ext4_fc_init_inode(&ei->vfs_inode);
+#ifdef CONFIG_FS_ENCRYPTION
+ ei->i_crypt_info = NULL;
+#endif
+#ifdef CONFIG_FS_VERITY
+ ei->i_verity_info = NULL;
+#endif
}
static int __init init_inodecache(void)
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index d9203228ce97..b0acb0c50313 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -389,6 +389,8 @@ static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations ext4_verityops = {
+ .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) -
+ (int)offsetof(struct ext4_inode_info, vfs_inode),
.begin_enable_verity = ext4_begin_enable_verity,
.end_enable_verity = ext4_end_enable_verity,
.get_verity_descriptor = ext4_get_verity_descriptor,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 46be7560548c..6e465bbc85ee 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -907,6 +907,12 @@ struct f2fs_inode_info {
unsigned int atomic_write_cnt;
loff_t original_i_size; /* original i_size before atomic write */
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */
+#endif
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info; /* filesystem verity info */
+#endif
};
static inline void get_read_extent_info(struct extent_info *ext,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index e16c4e2830c2..2619cbbd7d2d 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -480,6 +480,12 @@ static void init_once(void *foo)
struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
inode_init_once(&fi->vfs_inode);
+#ifdef CONFIG_FS_ENCRYPTION
+ fi->i_crypt_info = NULL;
+#endif
+#ifdef CONFIG_FS_VERITY
+ fi->i_verity_info = NULL;
+#endif
}
#ifdef CONFIG_QUOTA
@@ -1744,7 +1750,7 @@ static int f2fs_drop_inode(struct inode *inode)
if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) {
if (!inode->i_nlink && !is_bad_inode(inode)) {
/* to avoid evict_inode call simultaneously */
- atomic_inc(&inode->i_count);
+ __iget(inode);
spin_unlock(&inode->i_lock);
/* should remain fi->extent_tree for writepage */
@@ -1763,12 +1769,12 @@ static int f2fs_drop_inode(struct inode *inode)
sb_end_intwrite(inode->i_sb);
spin_lock(&inode->i_lock);
- atomic_dec(&inode->i_count);
+ iput(inode);
}
trace_f2fs_drop_inode(inode, 0);
return 0;
}
- ret = generic_drop_inode(inode);
+ ret = inode_generic_drop(inode);
if (!ret)
ret = fscrypt_drop_inode(inode);
trace_f2fs_drop_inode(inode, ret);
@@ -3570,6 +3576,8 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
}
static const struct fscrypt_operations f2fs_cryptops = {
+ .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_crypt_info) -
+ (int)offsetof(struct f2fs_inode_info, vfs_inode),
.needs_bounce_pages = 1,
.has_32bit_inodes = 1,
.supports_subblock_data_units = 1,
@@ -3581,7 +3589,7 @@ static const struct fscrypt_operations f2fs_cryptops = {
.has_stable_inodes = f2fs_has_stable_inodes,
.get_devices = f2fs_get_devices,
};
-#endif
+#endif /* CONFIG_FS_ENCRYPTION */
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 2287f238ae09..f0ab9a3c7a82 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -287,6 +287,8 @@ static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations f2fs_verityops = {
+ .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_verity_info) -
+ (int)offsetof(struct f2fs_inode_info, vfs_inode),
.begin_enable_verity = f2fs_begin_enable_verity,
.end_enable_verity = f2fs_end_enable_verity,
.get_verity_descriptor = f2fs_get_verity_descriptor,
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 5598e4d57422..72f8433d9109 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -355,8 +355,7 @@ static bool rw_hint_valid(u64 hint)
}
}
-static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_get_rw_hint(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
@@ -367,8 +366,7 @@ static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
return 0;
}
-static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_set_rw_hint(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
@@ -547,10 +545,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = memfd_fcntl(filp, cmd, argi);
break;
case F_GET_RW_HINT:
- err = fcntl_get_rw_hint(filp, cmd, arg);
+ err = fcntl_get_rw_hint(filp, arg);
break;
case F_SET_RW_HINT:
- err = fcntl_set_rw_hint(filp, cmd, arg);
+ err = fcntl_set_rw_hint(filp, arg);
break;
default:
break;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 68a7d2861c58..052f9c9368fb 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -11,6 +11,7 @@
#include <linux/personality.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
+#include <linux/nsfs.h>
#include "internal.h"
#include "mount.h"
@@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root)
return 0;
}
+ if (fd == FD_NSFS_ROOT) {
+ nsfs_get_root(root);
+ return 0;
+ }
+
return -EBADF;
}
@@ -208,6 +214,14 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
return 1;
/*
+ * Verify that the decoded dentry itself has a valid id mapping.
+ * In case the decoded dentry is the mountfd root itself, this
+ * verifies that the mountfd inode itself has a valid id mapping.
+ */
+ if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry)))
+ return 0;
+
+ /*
* It's racy as we're not taking rename_lock but we're able to ignore
* permissions and we just need an approximation whether we were able
* to follow a path to the file.
diff --git a/fs/file.c b/fs/file.c
index 6d2275c3be9c..28743b742e3c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1330,7 +1330,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
err = expand_files(files, fd);
if (unlikely(err < 0))
goto out_unlock;
- return do_dup2(files, file, fd, flags);
+ err = do_dup2(files, file, fd, flags);
+ if (err < 0)
+ return err;
+ return 0;
out_unlock:
spin_unlock(&files->file_lock);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a07b8cf73ae2..2b35e80037fe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}
struct inode_switch_wbs_context {
- struct rcu_work work;
+ /* List of queued switching contexts for the wb */
+ struct llist_node list;
/*
* Multiple inodes can be switched at once. The switching procedure
@@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
* array embedded into struct inode_switch_wbs_context. Otherwise
* an inode could be left in a non-consistent state.
*/
- struct bdi_writeback *new_wb;
struct inode *inodes[];
};
@@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode,
* Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
* the specific list @inode was on is ignored and the @inode is put on
* ->b_dirty which is always correct including from ->b_dirty_time.
- * The transfer preserves @inode->dirtied_when ordering. If the @inode
- * was clean, it means it was on the b_attached list, so move it onto
- * the b_attached list of @new_wb.
+ * If the @inode was clean, it means it was on the b_attached list, so
+ * move it onto the b_attached list of @new_wb.
*/
if (!list_empty(&inode->i_io_list)) {
inode->i_wb = new_wb;
if (inode->i_state & I_DIRTY_ALL) {
- struct inode *pos;
-
- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
- if (time_after_eq(inode->dirtied_when,
- pos->dirtied_when))
- break;
+ /*
+ * We need to keep the b_dirty list sorted by
+ * dirtied_time_when. However, properly sorting the
+ * inode into the list gets too expensive when switching
+ * many inodes, so just attach the inode at the end of the
+ * dirty list and clobber its dirtied_time_when.
+ */
+ inode->dirtied_time_when = jiffies;
inode_io_list_move_locked(inode, new_wb,
- pos->i_io_list.prev);
+ &new_wb->b_dirty);
} else {
inode_cgwb_move_to_attached(inode, new_wb);
}
@@ -486,13 +487,11 @@ skip_switch:
return switched;
}
-static void inode_switch_wbs_work_fn(struct work_struct *work)
+static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
+ struct inode_switch_wbs_context *isw)
{
- struct inode_switch_wbs_context *isw =
- container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
- struct bdi_writeback *new_wb = isw->new_wb;
unsigned long nr_switched = 0;
struct inode **inodep;
@@ -502,6 +501,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
*/
down_read(&bdi->wb_switch_rwsem);
+ inodep = isw->inodes;
/*
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
@@ -512,6 +512,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* gives us exclusion against all wb related operations on @inode
* including IO list manipulations and stat updates.
*/
+relock:
if (old_wb < new_wb) {
spin_lock(&old_wb->list_lock);
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
@@ -520,10 +521,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
}
- for (inodep = isw->inodes; *inodep; inodep++) {
+ while (*inodep) {
WARN_ON_ONCE((*inodep)->i_wb != old_wb);
if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
nr_switched++;
+ inodep++;
+ if (*inodep && need_resched()) {
+ spin_unlock(&new_wb->list_lock);
+ spin_unlock(&old_wb->list_lock);
+ cond_resched();
+ goto relock;
+ }
}
spin_unlock(&new_wb->list_lock);
@@ -543,6 +551,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
atomic_dec(&isw_nr_in_flight);
}
+void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
+ switch_work);
+ struct inode_switch_wbs_context *isw, *next_isw;
+ struct llist_node *list;
+
+ /*
+ * Grab out reference to wb so that it cannot get freed under us
+ * after we process all the isw items.
+ */
+ wb_get(new_wb);
+ while (1) {
+ list = llist_del_all(&new_wb->switch_wbs_ctxs);
+ /* Nothing to do? */
+ if (!list)
+ break;
+ /*
+ * In addition to synchronizing among switchers, I_WB_SWITCH
+ * tells the RCU protected stat update paths to grab the i_page
+ * lock so that stat transfer can synchronize against them.
+ * Let's continue after I_WB_SWITCH is guaranteed to be
+ * visible.
+ */
+ synchronize_rcu();
+
+ llist_for_each_entry_safe(isw, next_isw, list, list)
+ process_inode_switch_wbs(new_wb, isw);
+ }
+ wb_put(new_wb);
+}
+
static bool inode_prepare_wbs_switch(struct inode *inode,
struct bdi_writeback *new_wb)
{
@@ -572,6 +612,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
return true;
}
+static void wb_queue_isw(struct bdi_writeback *wb,
+ struct inode_switch_wbs_context *isw)
+{
+ if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
+ queue_work(isw_wq, &wb->switch_work);
+}
+
/**
* inode_switch_wbs - change the wb association of an inode
* @inode: target inode
@@ -585,6 +632,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
+ struct bdi_writeback *new_wb = NULL;
/* noop if seems to be already in progress */
if (inode->i_state & I_WB_SWITCH)
@@ -609,40 +657,35 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (!memcg_css)
goto out_free;
- isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
css_put(memcg_css);
- if (!isw->new_wb)
+ if (!new_wb)
goto out_free;
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ if (!inode_prepare_wbs_switch(inode, new_wb))
goto out_free;
isw->inodes[0] = inode;
- /*
- * In addition to synchronizing among switchers, I_WB_SWITCH tells
- * the RCU protected stat update paths to grab the i_page
- * lock so that stat transfer can synchronize against them.
- * Let's continue after I_WB_SWITCH is guaranteed to be visible.
- */
- INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_rcu_work(isw_wq, &isw->work);
+ trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1);
+ wb_queue_isw(new_wb, isw);
return;
out_free:
atomic_dec(&isw_nr_in_flight);
- if (isw->new_wb)
- wb_put(isw->new_wb);
+ if (new_wb)
+ wb_put(new_wb);
kfree(isw);
}
-static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
+ struct inode_switch_wbs_context *isw,
struct list_head *list, int *nr)
{
struct inode *inode;
list_for_each_entry(inode, list, i_io_list) {
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ if (!inode_prepare_wbs_switch(inode, new_wb))
continue;
isw->inodes[*nr] = inode;
@@ -666,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
+ struct bdi_writeback *new_wb;
int nr;
bool restart = false;
@@ -678,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
for (memcg_css = wb->memcg_css->parent; memcg_css;
memcg_css = memcg_css->parent) {
- isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
- if (isw->new_wb)
+ new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
+ if (new_wb)
break;
}
- if (unlikely(!isw->new_wb))
- isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
+ if (unlikely(!new_wb))
+ new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
nr = 0;
spin_lock(&wb->list_lock);
@@ -695,27 +739,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
* bandwidth restrictions, as writeback of inode metadata is not
* accounted for.
*/
- restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+ restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
if (!restart)
- restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
+ restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
+ &nr);
spin_unlock(&wb->list_lock);
/* no attached inodes? bail out */
if (nr == 0) {
atomic_dec(&isw_nr_in_flight);
- wb_put(isw->new_wb);
+ wb_put(new_wb);
kfree(isw);
return restart;
}
- /*
- * In addition to synchronizing among switchers, I_WB_SWITCH tells
- * the RCU protected stat update paths to grab the i_page
- * lock so that stat transfer can synchronize against them.
- * Let's continue after I_WB_SWITCH is guaranteed to be visible.
- */
- INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_rcu_work(isw_wq, &isw->work);
+ trace_inode_switch_wbs_queue(wb, new_wb, nr);
+ wb_queue_isw(new_wb, isw);
return restart;
}
@@ -1123,7 +1162,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
dirty = dirty * 10 / 8;
/* issue the writeback work */
- work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
+ work = kzalloc(sizeof(*work), GFP_NOWAIT);
if (work) {
work->nr_pages = dirty;
work->sync_mode = WB_SYNC_NONE;
@@ -1180,7 +1219,7 @@ void cgroup_writeback_umount(struct super_block *sb)
static int __init cgroup_writeback_init(void)
{
- isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ isw_wq = alloc_workqueue("inode_switch_wbs", WQ_PERCPU, 0);
if (!isw_wq)
return -ENOMEM;
return 0;
@@ -1767,7 +1806,7 @@ static int writeback_single_inode(struct inode *inode,
int ret = 0;
spin_lock(&inode->i_lock);
- if (!atomic_read(&inode->i_count))
+ if (!icount_read(inode))
WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
else
WARN_ON(inode->i_state & I_WILL_FREE);
@@ -2442,7 +2481,7 @@ static int dirtytime_interval_handler(const struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
- mod_delayed_work(system_wq, &dirtytime_work, 0);
+ mod_delayed_work(system_percpu_wq, &dirtytime_work, 0);
return ret;
}
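A note on the fs-writeback rework above: switching contexts are now pushed onto a per-wb llist, and only the producer whose llist_add() turned the list from empty to non-empty queues wb->switch_work; the worker then detaches the whole batch with llist_del_all() and loops until the list stays empty. The userspace sketch below shows the same "first producer schedules the consumer" idiom with C11 atomics; queue_ctx, schedule_worker and drain are invented names, not the kernel API.

    #include <stdatomic.h>
    #include <stdio.h>

    struct ctx {
        struct ctx *next;
        int id;                                /* stand-in payload */
    };

    static _Atomic(struct ctx *) pending;      /* lock-free LIFO of queued contexts */

    static void schedule_worker(void)          /* stand-in for queue_work() */
    {
        puts("worker scheduled");
    }

    /* Push one context; only the producer that found the list empty kicks the
     * worker, mirroring llist_add() returning true in wb_queue_isw(). */
    static void queue_ctx(struct ctx *c)
    {
        struct ctx *old = atomic_load(&pending);

        do {
            c->next = old;
        } while (!atomic_compare_exchange_weak(&pending, &old, c));
        if (old == NULL)
            schedule_worker();
    }

    /* Worker: detach everything at once (llist_del_all() analogue), process,
     * and repeat until no new contexts arrived in the meantime. */
    static void drain(void)
    {
        struct ctx *list;

        while ((list = atomic_exchange(&pending, NULL)) != NULL)
            for (; list; list = list->next)
                printf("processing ctx %d\n", list->id);
    }

    int main(void)
    {
        struct ctx a = { .id = 1 }, b = { .id = 2 };

        queue_ctx(&a);   /* prints "worker scheduled" */
        queue_ctx(&b);   /* list already non-empty: no second kick */
        drain();         /* processes 2 then 1 (LIFO order) */
        return 0;
    }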
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 1aaf4cb2afb2..f645c99204eb 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -18,50 +18,56 @@
#include "internal.h"
#include "mount.h"
+static inline const char *fetch_message_locked(struct fc_log *log, size_t len,
+ bool *need_free)
+{
+ const char *p;
+ int index;
+
+ if (unlikely(log->head == log->tail))
+ return ERR_PTR(-ENODATA);
+
+ index = log->tail & (ARRAY_SIZE(log->buffer) - 1);
+ p = log->buffer[index];
+ if (unlikely(strlen(p) > len))
+ return ERR_PTR(-EMSGSIZE);
+
+ log->buffer[index] = NULL;
+ *need_free = log->need_free & (1 << index);
+ log->need_free &= ~(1 << index);
+ log->tail++;
+
+ return p;
+}
+
/*
* Allow the user to read back any error, warning or informational messages.
+ * Only one message is returned for each read(2) call.
*/
static ssize_t fscontext_read(struct file *file,
char __user *_buf, size_t len, loff_t *pos)
{
struct fs_context *fc = file->private_data;
- struct fc_log *log = fc->log.log;
- unsigned int logsize = ARRAY_SIZE(log->buffer);
- ssize_t ret;
- char *p;
+ ssize_t err;
+ const char *p __free(kfree) = NULL, *message;
bool need_free;
- int index, n;
+ int n;
- ret = mutex_lock_interruptible(&fc->uapi_mutex);
- if (ret < 0)
- return ret;
-
- if (log->head == log->tail) {
- mutex_unlock(&fc->uapi_mutex);
- return -ENODATA;
- }
-
- index = log->tail & (logsize - 1);
- p = log->buffer[index];
- need_free = log->need_free & (1 << index);
- log->buffer[index] = NULL;
- log->need_free &= ~(1 << index);
- log->tail++;
+ err = mutex_lock_interruptible(&fc->uapi_mutex);
+ if (err < 0)
+ return err;
+ message = fetch_message_locked(fc->log.log, len, &need_free);
mutex_unlock(&fc->uapi_mutex);
+ if (IS_ERR(message))
+ return PTR_ERR(message);
- ret = -EMSGSIZE;
- n = strlen(p);
- if (n > len)
- goto err_free;
- ret = -EFAULT;
- if (copy_to_user(_buf, p, n) != 0)
- goto err_free;
- ret = n;
-
-err_free:
if (need_free)
- kfree(p);
- return ret;
+ p = message;
+
+ n = strlen(message);
+ if (copy_to_user(_buf, message, n))
+ return -EFAULT;
+ return n;
}
static int fscontext_release(struct inode *inode, struct file *file)
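The fscontext_read() rewrite above relies on the kernel's scope-based cleanup: "const char *p __free(kfree) = NULL" means whatever p points to when the function returns is passed to kfree(), so the old err_free label disappears and early returns cannot leak the message. Outside the kernel the same shape is available through the compiler's cleanup attribute (GCC/Clang); the sketch below is an analogue with invented names (free_charp, copy_message), not the kernel macro itself.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void free_charp(char **p)
    {
        free(*p);                           /* free(NULL) is a no-op */
    }
    #define __cleanup_free __attribute__((cleanup(free_charp)))

    static int copy_message(char *dst, size_t len)
    {
        char *msg __cleanup_free = NULL;    /* freed automatically on every return */

        msg = strdup("mount option \"foo\" not recognised");
        if (!msg)
            return -1;
        if (strlen(msg) >= len)
            return -1;                      /* msg still freed here */
        strcpy(dst, msg);
        return 0;                           /* ...and here */
    }

    int main(void)
    {
        char buf[128] = "";

        printf("%d %s\n", copy_message(buf, sizeof(buf)), buf);
        return 0;
    }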
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e80cd8f2c049..66a1ba8c56b5 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -119,7 +119,7 @@ void fuse_check_timeout(struct work_struct *work)
goto abort_conn;
out:
- queue_delayed_work(system_wq, &fc->timeout.work,
+ queue_delayed_work(system_percpu_wq, &fc->timeout.work,
fuse_timeout_timer_freq);
return;
@@ -1893,7 +1893,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
index = outarg->offset >> PAGE_SHIFT;
- while (num) {
+ while (num && ap->num_folios < num_pages) {
struct folio *folio;
unsigned int folio_offset;
unsigned int nr_bytes;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2d817d7cab26..5c569c3cb53f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1199,7 +1199,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
if (attr->blksize != 0)
blkbits = ilog2(attr->blksize);
else
- blkbits = inode->i_sb->s_blocksize_bits;
+ blkbits = fc->blkbits;
stat->blksize = 1 << blkbits;
}
@@ -1377,6 +1377,7 @@ retry:
generic_fillattr(idmap, request_mask, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
+ stat->blksize = 1 << fi->cached_i_blkbits;
if (test_bit(FUSE_I_BTIME, &fi->state)) {
stat->btime = fi->i_btime;
stat->result_mask |= STATX_BTIME;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5525a4520b0f..4adcf09d4b01 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2960,7 +2960,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
.nodeid_out = ff_out->nodeid,
.fh_out = ff_out->fh,
.off_out = pos_out,
- .len = len,
+ .len = min_t(size_t, len, UINT_MAX & PAGE_MASK),
.flags = flags
};
struct fuse_write_out outarg;
@@ -3026,6 +3026,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
fc->no_copy_file_range = 1;
err = -EOPNOTSUPP;
}
+ if (!err && outarg.size > len)
+ err = -EIO;
+
if (err)
goto out;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ec248d13c8bf..cc428d04be3e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -210,6 +210,12 @@ struct fuse_inode {
/** Reference to backing file in passthrough mode */
struct fuse_backing *fb;
#endif
+
+ /*
+ * The underlying inode->i_blkbits value will not be modified,
+ * so preserve the blocksize specified by the server.
+ */
+ u8 cached_i_blkbits;
};
/** FUSE inode state bits */
@@ -969,6 +975,14 @@ struct fuse_conn {
/* Request timeout (in jiffies). 0 = no timeout */
unsigned int req_timeout;
} timeout;
+
+ /*
+ * This is a workaround until fuse uses iomap for reads.
+ * For fuseblk servers, this represents the blocksize passed in at
+ * mount time; for regular fuse servers, this is equivalent to
+ * inode->i_blkbits.
+ */
+ u8 blkbits;
};
/*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 67c2318bfc42..7485a41af892 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -289,6 +289,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
}
}
+ if (attr->blksize)
+ fi->cached_i_blkbits = ilog2(attr->blksize);
+ else
+ fi->cached_i_blkbits = fc->blkbits;
+
/*
* Don't set the sticky bit in i_mode, unless we want the VFS
* to check permissions. This prevents failures due to the
@@ -1204,7 +1209,7 @@ static const struct super_operations fuse_super_operations = {
.free_inode = fuse_free_inode,
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
.sync_fs = fuse_sync_fs,
@@ -1268,7 +1273,7 @@ static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout)
{
fc->timeout.req_timeout = secs_to_jiffies(timeout);
INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout);
- queue_delayed_work(system_wq, &fc->timeout.work,
+ queue_delayed_work(system_percpu_wq, &fc->timeout.work,
fuse_timeout_timer_freq);
}
@@ -1805,10 +1810,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -EINVAL;
if (!sb_set_blocksize(sb, ctx->blksize))
goto err;
+ /*
+ * This is a workaround until fuse hooks into iomap for reads.
+ * Use PAGE_SIZE for the blocksize else if the writeback cache
+ * is enabled, buffered writes go through iomap and a read may
+ * overwrite partially written data if blocksize < PAGE_SIZE
+ */
+ fc->blkbits = sb->s_blocksize_bits;
+ if (ctx->blksize != PAGE_SIZE &&
+ !sb_set_blocksize(sb, PAGE_SIZE))
+ goto err;
#endif
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
+ fc->blkbits = sb->s_blocksize_bits;
}
sb->s_subtype = ctx->subtype;
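For the fuse blocksize changes above: the server-provided blksize is remembered only as a shift (cached_i_blkbits), since the superblock itself is now forced to PAGE_SIZE, and getattr reconstructs stat->blksize as 1 << blkbits with fc->blkbits as the fallback. A trivial userspace illustration of that round-trip; ilog2 is open-coded here because it is a kernel helper, and the concrete values are hypothetical.

    #include <stdio.h>

    /* Open-coded stand-in for the kernel's ilog2() on a nonzero 32-bit value. */
    static unsigned int ilog2_u32(unsigned int v)
    {
        unsigned int r = 0;

        while (v >>= 1)
            r++;
        return r;
    }

    int main(void)
    {
        unsigned int server_blksize = 4096;  /* hypothetical value from the server */
        unsigned int fallback_blkbits = 12;  /* e.g. PAGE_SHIFT on most arches */
        unsigned int blkbits = server_blksize ? ilog2_u32(server_blksize)
                                              : fallback_blkbits;

        printf("stat.blksize = %u\n", 1u << blkbits);   /* 4096 */
        return 0;
    }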
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 607ef735ad4a..eb97ac009e75 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -237,6 +237,11 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
if (!file)
goto out;
+ /* read/write/splice/mmap passthrough is only relevant for regular files */
+ res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
+ if (!d_is_reg(file->f_path.dentry))
+ goto out_fput;
+
backing_sb = file_inode(file)->i_sb;
res = -ELOOP;
if (backing_sb->s_stack_depth >= fc->max_stack_depth)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index c826e7ca49f5..76c8fd0bfc75 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1016,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (kaddr)
*kaddr = fs->window_kaddr + offset;
if (pfn)
- *pfn = fs->window_phys_addr + offset;
+ *pfn = PHYS_PFN(fs->window_phys_addr + offset);
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 72d95185a39f..bc67fa058c84 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1442,6 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int ret;
if (!(fl->c.flc_flags & FL_POSIX))
return -ENOLCK;
@@ -1450,14 +1451,20 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
locks_lock_file_wait(file, fl);
return -EIO;
}
- if (cmd == F_CANCELLK)
- return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl);
- else if (IS_GETLK(cmd))
- return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
- else if (lock_is_unlock(fl))
- return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
- else
- return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
+ down_read(&ls->ls_sem);
+ ret = -ENODEV;
+ if (likely(ls->ls_dlm != NULL)) {
+ if (cmd == F_CANCELLK)
+ ret = dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl);
+ else if (IS_GETLK(cmd))
+ ret = dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
+ else if (lock_is_unlock(fl))
+ ret = dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
+ else
+ ret = dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
+ }
+ up_read(&ls->ls_sem);
+ return ret;
}
static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh)
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b6fd1cb17de7..b677c0e6b9ab 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -481,11 +481,9 @@ done:
/**
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
- *
- * Returns true on success (i.e., progress was made or there are no waiters).
*/
-static bool do_promote(struct gfs2_glock *gl)
+static void do_promote(struct gfs2_glock *gl)
{
struct gfs2_holder *gh, *current_gh;
@@ -496,13 +494,10 @@ static bool do_promote(struct gfs2_glock *gl)
if (!may_grant(gl, current_gh, gh)) {
/*
* If we get here, it means we may not grant this
- * holder for some reason. If this holder is at the
- * head of the list, it means we have a blocked holder
- * at the head, so return false.
+ * holder for some reason.
*/
- if (list_is_first(&gh->gh_list, &gl->gl_holders))
- return false;
- do_error(gl, 0);
+ if (current_gh)
+ do_error(gl, 0); /* Fail queued try locks */
break;
}
set_bit(HIF_HOLDER, &gh->gh_iflags);
@@ -511,7 +506,6 @@ static bool do_promote(struct gfs2_glock *gl)
if (!current_gh)
current_gh = gh;
}
- return true;
}
/**
@@ -646,8 +640,10 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
}
/* Fast path - we got what we asked for */
- if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
+ clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
gfs2_demote_wake(gl);
+ }
if (gl->gl_state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
int rv;
@@ -693,54 +689,33 @@ __acquires(&gl->gl_lockref.lock)
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) &&
gh && !(gh->gh_flags & LM_FLAG_NOEXP))
goto skip_inval;
- lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP);
GLOCK_BUG_ON(gl, gl->gl_state == target);
GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
- if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
- glops->go_inval) {
- /*
- * If another process is already doing the invalidate, let that
- * finish first. The glock state machine will get back to this
- * holder again later.
- */
- if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS,
- &gl->gl_flags))
- return;
- do_error(gl, 0); /* Fail queued try locks */
- }
- gl->gl_req = target;
- set_bit(GLF_BLOCKING, &gl->gl_flags);
- if ((gl->gl_req == LM_ST_UNLOCKED) ||
- (gl->gl_state == LM_ST_EXCLUSIVE) ||
- (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
- clear_bit(GLF_BLOCKING, &gl->gl_flags);
- if (!glops->go_inval && !glops->go_sync)
+ if (!glops->go_inval || !glops->go_sync)
goto skip_inval;
spin_unlock(&gl->gl_lockref.lock);
- if (glops->go_sync) {
- ret = glops->go_sync(gl);
- /* If we had a problem syncing (due to io errors or whatever,
- * we should not invalidate the metadata or tell dlm to
- * release the glock to other nodes.
- */
- if (ret) {
- if (cmpxchg(&sdp->sd_log_error, 0, ret)) {
- fs_err(sdp, "Error %d syncing glock \n", ret);
- gfs2_dump_glock(NULL, gl, true);
- }
- spin_lock(&gl->gl_lockref.lock);
- goto skip_inval;
+ ret = glops->go_sync(gl);
+ /* If we had a problem syncing (due to io errors or whatever),
+ * we should not invalidate the metadata or tell dlm to
+ * release the glock to other nodes.
+ */
+ if (ret) {
+ if (cmpxchg(&sdp->sd_log_error, 0, ret)) {
+ fs_err(sdp, "Error %d syncing glock\n", ret);
+ gfs2_dump_glock(NULL, gl, true);
}
+ spin_lock(&gl->gl_lockref.lock);
+ goto skip_inval;
}
- if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) {
+
+ if (target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) {
/*
* The call to go_sync should have cleared out the ail list.
* If there are still items, we have a problem. We ought to
@@ -755,12 +730,10 @@ __acquires(&gl->gl_lockref.lock)
gfs2_dump_glock(NULL, gl, true);
}
glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
- clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
}
spin_lock(&gl->gl_lockref.lock);
skip_inval:
- gl->gl_lockref.count++;
/*
* Check for an error encountered since we called go_sync and go_inval.
* If so, we can't withdraw from the glock code because the withdraw
@@ -803,38 +776,41 @@ skip_inval:
if (!test_bit(GLF_CANCELING, &gl->gl_flags))
clear_bit(GLF_LOCK, &gl->gl_flags);
clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+ gl->gl_lockref.count++;
gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
return;
- } else {
- clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
}
}
if (ls->ls_ops->lm_lock) {
set_bit(GLF_PENDING_REPLY, &gl->gl_flags);
spin_unlock(&gl->gl_lockref.lock);
- ret = ls->ls_ops->lm_lock(gl, target, lck_flags);
+ ret = ls->ls_ops->lm_lock(gl, target, gh ? gh->gh_flags : 0);
spin_lock(&gl->gl_lockref.lock);
- if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED &&
- target == LM_ST_UNLOCKED &&
- test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) {
+ if (!ret) {
+ /* The operation will be completed asynchronously. */
+ gl->gl_lockref.count++;
+ return;
+ }
+ clear_bit(GLF_PENDING_REPLY, &gl->gl_flags);
+
+ if (ret == -ENODEV && gl->gl_target == LM_ST_UNLOCKED &&
+ target == LM_ST_UNLOCKED) {
/*
* The lockspace has been released and the lock has
* been unlocked implicitly.
*/
- } else if (ret) {
- fs_err(sdp, "lm_lock ret %d\n", ret);
- target = gl->gl_state | LM_OUT_ERROR;
} else {
- /* The operation will be completed asynchronously. */
+ fs_err(sdp, "lm_lock ret %d\n", ret);
+ GLOCK_BUG_ON(gl, !gfs2_withdrawing_or_withdrawn(sdp));
return;
}
- clear_bit(GLF_PENDING_REPLY, &gl->gl_flags);
}
/* Complete the operation now. */
finish_xmote(gl, target);
+ gl->gl_lockref.count++;
gfs2_glock_queue_work(gl, 0);
}
@@ -855,11 +831,20 @@ __acquires(&gl->gl_lockref.lock)
return;
set_bit(GLF_LOCK, &gl->gl_flags);
- /* While a demote is in progress, the GLF_LOCK flag must be set. */
+ /*
+ * The GLF_DEMOTE_IN_PROGRESS flag is only set intermittently during
+ * locking operations. We have just started a locking operation by
+ * setting the GLF_LOCK flag, so the GLF_DEMOTE_IN_PROGRESS flag must
+ * be cleared.
+ */
GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
- if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
- gl->gl_demote_state != gl->gl_state) {
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
+ if (gl->gl_demote_state == gl->gl_state) {
+ gfs2_demote_wake(gl);
+ goto promote;
+ }
+
if (find_first_holder(gl))
goto out_unlock;
if (nonblock)
@@ -869,31 +854,31 @@ __acquires(&gl->gl_lockref.lock)
gl->gl_target = gl->gl_demote_state;
do_xmote(gl, NULL, gl->gl_target);
return;
- } else {
- if (test_bit(GLF_DEMOTE, &gl->gl_flags))
- gfs2_demote_wake(gl);
- if (do_promote(gl))
- goto out_unlock;
- gh = find_first_waiter(gl);
- if (!gh)
- goto out_unlock;
- gl->gl_target = gh->gh_state;
- if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
- do_error(gl, 0); /* Fail queued try locks */
- do_xmote(gl, gh, gl->gl_target);
- return;
}
+promote:
+ do_promote(gl);
+ if (find_first_holder(gl))
+ goto out_unlock;
+ gh = find_first_waiter(gl);
+ if (!gh)
+ goto out_unlock;
+ if (nonblock)
+ goto out_sched;
+ gl->gl_target = gh->gh_state;
+ if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ do_error(gl, 0); /* Fail queued try locks */
+ do_xmote(gl, gh, gl->gl_target);
+ return;
+
out_sched:
clear_bit(GLF_LOCK, &gl->gl_flags);
- smp_mb__after_atomic();
gl->gl_lockref.count++;
gfs2_glock_queue_work(gl, 0);
return;
out_unlock:
clear_bit(GLF_LOCK, &gl->gl_flags);
- smp_mb__after_atomic();
}
/**
@@ -1462,6 +1447,24 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
va_end(args);
}
+static bool gfs2_should_queue_trylock(struct gfs2_glock *gl,
+ struct gfs2_holder *gh)
+{
+ struct gfs2_holder *current_gh, *gh2;
+
+ current_gh = find_first_holder(gl);
+ if (current_gh && !may_grant(gl, current_gh, gh))
+ return false;
+
+ list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
+ continue;
+ if (!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ return false;
+ }
+ return true;
+}
+
static inline bool pid_is_meaningful(const struct gfs2_holder *gh)
{
if (!(gh->gh_flags & GL_NOPID))
@@ -1480,27 +1483,20 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh)
*/
static inline void add_to_queue(struct gfs2_holder *gh)
-__releases(&gl->gl_lockref.lock)
-__acquires(&gl->gl_lockref.lock)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_holder *gh2;
- int try_futile = 0;
GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL);
if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
GLOCK_BUG_ON(gl, true);
- if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
- if (test_bit(GLF_LOCK, &gl->gl_flags)) {
- struct gfs2_holder *current_gh;
-
- current_gh = find_first_holder(gl);
- try_futile = !may_grant(gl, current_gh, gh);
- }
- if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
- goto fail;
+ if ((gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
+ !gfs2_should_queue_trylock(gl, gh)) {
+ gh->gh_error = GLR_TRYFAILED;
+ gfs2_holder_wake(gh);
+ return;
}
list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
@@ -1512,15 +1508,6 @@ __acquires(&gl->gl_lockref.lock)
continue;
goto trap_recursive;
}
- list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
- if (try_futile &&
- !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
-fail:
- gh->gh_error = GLR_TRYFAILED;
- gfs2_holder_wake(gh);
- return;
- }
- }
trace_gfs2_glock_queue(gh, 1);
gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT);
@@ -2321,8 +2308,6 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'y';
if (test_bit(GLF_LFLUSH, gflags))
*p++ = 'f';
- if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
- *p++ = 'i';
if (test_bit(GLF_PENDING_REPLY, gflags))
*p++ = 'R';
if (test_bit(GLF_HAVE_REPLY, gflags))
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 9339a3bff6ee..d041b922b45e 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -68,6 +68,10 @@ enum {
* also be granted in SHARED. The preferred state is whichever is compatible
* with other granted locks, or the specified state if no other locks exist.
*
+ * In addition, when a lock is already held in EX mode locally, a SHARED or
+ * DEFERRED mode request with the LM_FLAG_ANY flag set will be granted.
+ * (The LM_FLAG_ANY flag is only used for SHARED mode requests currently.)
+ *
* LM_FLAG_NODE_SCOPE
* This holder agrees to share the lock within this node. In other words,
* the glock is held in EX mode according to DLM, but local holders on the
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index d4ad82f47eee..5a0ea416cfda 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -319,7 +319,6 @@ enum {
GLF_DEMOTE_IN_PROGRESS = 5,
GLF_DIRTY = 6,
GLF_LFLUSH = 7,
- GLF_INVALIDATE_IN_PROGRESS = 8,
GLF_HAVE_REPLY = 9,
GLF_INITIAL = 10,
GLF_HAVE_FROZEN_REPLY = 11,
@@ -376,7 +375,6 @@ struct gfs2_glock {
enum {
GIF_QD_LOCKED = 1,
GIF_SW_PAGED = 3,
- GIF_FREE_VFS_INODE = 5,
GIF_GLOP_PENDING = 6,
};
@@ -658,6 +656,8 @@ struct lm_lockstruct {
struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
char *ls_lvb_bits;
+ struct rw_semaphore ls_sem;
+
spinlock_t ls_recover_spin; /* protects following fields */
unsigned long ls_recover_flags; /* DFL_ */
uint32_t ls_recover_mount; /* gen in first recover_done cb */
@@ -823,7 +823,6 @@ struct gfs2_sbd {
atomic_t sd_log_in_flight;
wait_queue_head_t sd_log_flush_wait;
int sd_log_error; /* First log error */
- wait_queue_head_t sd_withdraw_wait;
unsigned int sd_log_tail;
unsigned int sd_log_flush_tail;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index cee5d199d2d8..4f00af7dd256 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -58,6 +58,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
/**
* gfs2_update_reply_times - Update locking statistics
* @gl: The glock to update
+ * @blocking: The operation may have been blocking
*
* This assumes that gl->gl_dstamp has been set earlier.
*
@@ -72,12 +73,12 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
* TRY_1CB flags are set are classified as non-blocking. All
* other DLM requests are counted as (potentially) blocking.
*/
-static inline void gfs2_update_reply_times(struct gfs2_glock *gl)
+static inline void gfs2_update_reply_times(struct gfs2_glock *gl,
+ bool blocking)
{
struct gfs2_pcpu_lkstats *lks;
const unsigned gltype = gl->gl_name.ln_type;
- unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ?
- GFS2_LKS_SRTTB : GFS2_LKS_SRTT;
+ unsigned index = blocking ? GFS2_LKS_SRTTB : GFS2_LKS_SRTT;
s64 rtt;
preempt_disable();
@@ -119,14 +120,18 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl)
static void gdlm_ast(void *arg)
{
struct gfs2_glock *gl = arg;
+ bool blocking;
unsigned ret;
+ blocking = test_bit(GLF_BLOCKING, &gl->gl_flags);
+ gfs2_update_reply_times(gl, blocking);
+ clear_bit(GLF_BLOCKING, &gl->gl_flags);
+
/* If the glock is dead, we only react to a dlm_unlock() reply. */
if (__lockref_is_dead(&gl->gl_lockref) &&
gl->gl_lksb.sb_status != -DLM_EUNLOCK)
return;
- gfs2_update_reply_times(gl);
BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr)
@@ -157,14 +162,6 @@ static void gdlm_ast(void *arg)
}
ret = gl->gl_req;
- if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) {
- if (gl->gl_req == LM_ST_SHARED)
- ret = LM_ST_DEFERRED;
- else if (gl->gl_req == LM_ST_DEFERRED)
- ret = LM_ST_SHARED;
- else
- BUG();
- }
/*
* The GLF_INITIAL flag is initially set for new glocks. Upon the
@@ -241,7 +238,7 @@ static bool down_conversion(int cur, int req)
}
static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
- const int cur, const int req)
+ const int req, bool blocking)
{
u32 lkf = 0;
@@ -256,15 +253,6 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
lkf |= DLM_LKF_NOQUEUEBAST;
}
- if (gfs_flags & LM_FLAG_ANY) {
- if (req == DLM_LOCK_PR)
- lkf |= DLM_LKF_ALTCW;
- else if (req == DLM_LOCK_CW)
- lkf |= DLM_LKF_ALTPR;
- else
- BUG();
- }
-
if (!test_bit(GLF_INITIAL, &gl->gl_flags)) {
lkf |= DLM_LKF_CONVERT;
@@ -274,7 +262,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
* "upward" lock conversions or else DLM will reject the
* request as invalid.
*/
- if (!down_conversion(cur, req))
+ if (blocking)
lkf |= DLM_LKF_QUECVT;
}
@@ -294,14 +282,20 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
unsigned int flags)
{
struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
+ bool blocking;
int cur, req;
u32 lkf;
char strname[GDLM_STRNAME_BYTES] = "";
int error;
+ gl->gl_req = req_state;
cur = make_mode(gl->gl_name.ln_sbd, gl->gl_state);
req = make_mode(gl->gl_name.ln_sbd, req_state);
- lkf = make_flags(gl, flags, cur, req);
+ blocking = !down_conversion(cur, req) &&
+ !(flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB));
+ lkf = make_flags(gl, flags, req, blocking);
+ if (blocking)
+ set_bit(GLF_BLOCKING, &gl->gl_flags);
gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
if (test_bit(GLF_INITIAL, &gl->gl_flags)) {
@@ -318,8 +312,13 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
*/
again:
- error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
- GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+ down_read(&ls->ls_sem);
+ error = -ENODEV;
+ if (likely(ls->ls_dlm != NULL)) {
+ error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
+ GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+ }
+ up_read(&ls->ls_sem);
if (error == -EBUSY) {
msleep(20);
goto again;
@@ -341,17 +340,10 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
return;
}
- clear_bit(GLF_BLOCKING, &gl->gl_flags);
gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
gfs2_update_request_times(gl);
- /* don't want to call dlm if we've unmounted the lock protocol */
- if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) {
- gfs2_glock_free(gl);
- return;
- }
-
/*
* When the lockspace is released, all remaining glocks will be
* unlocked automatically. This is more efficient than unlocking them
@@ -369,13 +361,23 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
flags |= DLM_LKF_VALBLK;
again:
- error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags,
- NULL, gl);
+ down_read(&ls->ls_sem);
+ error = -ENODEV;
+ if (likely(ls->ls_dlm != NULL)) {
+ error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags,
+ NULL, gl);
+ }
+ up_read(&ls->ls_sem);
if (error == -EBUSY) {
msleep(20);
goto again;
}
+ if (error == -ENODEV) {
+ gfs2_glock_free(gl);
+ return;
+ }
+
if (error) {
fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n",
gl->gl_name.ln_type,
@@ -386,7 +388,12 @@ again:
static void gdlm_cancel(struct gfs2_glock *gl)
{
struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
- dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
+
+ down_read(&ls->ls_sem);
+ if (likely(ls->ls_dlm != NULL)) {
+ dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
+ }
+ up_read(&ls->ls_sem);
}
/*
@@ -567,7 +574,11 @@ static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int error;
- error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
+ down_read(&ls->ls_sem);
+ error = -ENODEV;
+ if (likely(ls->ls_dlm != NULL))
+ error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
+ up_read(&ls->ls_sem);
if (error) {
fs_err(sdp, "%s lkid %x error %d\n",
name, lksb->sb_lkid, error);
@@ -594,9 +605,14 @@ static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
memset(strname, 0, GDLM_STRNAME_BYTES);
snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
- error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
- strname, GDLM_STRNAME_BYTES - 1,
- 0, sync_wait_cb, ls, NULL);
+ down_read(&ls->ls_sem);
+ error = -ENODEV;
+ if (likely(ls->ls_dlm != NULL)) {
+ error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
+ strname, GDLM_STRNAME_BYTES - 1,
+ 0, sync_wait_cb, ls, NULL);
+ }
+ up_read(&ls->ls_sem);
if (error) {
fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
name, lksb->sb_lkid, flags, mode, error);
@@ -1323,6 +1339,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
*/
INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func);
+ ls->ls_dlm = NULL;
spin_lock_init(&ls->ls_recover_spin);
ls->ls_recover_flags = 0;
ls->ls_recover_mount = 0;
@@ -1357,6 +1374,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
* create/join lockspace
*/
+ init_rwsem(&ls->ls_sem);
error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
&gdlm_lockspace_ops, sdp, &ops_result,
&ls->ls_dlm);
@@ -1400,7 +1418,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
return 0;
fail_release:
- dlm_release_lockspace(ls->ls_dlm, 2);
+ dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL);
fail_free:
free_recover_size(ls);
fail:
@@ -1436,10 +1454,12 @@ static void gdlm_unmount(struct gfs2_sbd *sdp)
/* mounted_lock and control_lock will be purged in dlm recovery */
release:
+ down_write(&ls->ls_sem);
if (ls->ls_dlm) {
- dlm_release_lockspace(ls->ls_dlm, 2);
+ dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL);
ls->ls_dlm = NULL;
}
+ up_write(&ls->ls_sem);
free_recover_size(ls);
}
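The lock_dlm changes above follow one pattern throughout: every dlm_lock()/dlm_unlock() call site takes ls->ls_sem for reading and re-checks ls->ls_dlm for NULL (returning -ENODEV if the lockspace is gone), while gdlm_unmount() takes the semaphore for writing before releasing the lockspace and clearing the pointer. The userspace sketch below shows that rwlock-guarded pointer-teardown idiom with a pthread rwlock; lockspace, do_lock_request and release_lockspace are invented names.

    #include <pthread.h>
    #include <stdio.h>
    #include <errno.h>

    struct lockspace { const char *name; };

    static pthread_rwlock_t ls_sem = PTHREAD_RWLOCK_INITIALIZER;
    static struct lockspace *ls;           /* NULL once torn down */

    /* Reader side: take the lock shared, re-check the pointer, then use it. */
    static int do_lock_request(void)
    {
        int err = -ENODEV;

        pthread_rwlock_rdlock(&ls_sem);
        if (ls != NULL) {
            printf("request sent to %s\n", ls->name);   /* ...use the lockspace... */
            err = 0;
        }
        pthread_rwlock_unlock(&ls_sem);
        return err;                        /* -ENODEV if teardown won the race */
    }

    /* Teardown side: exclusive lock, release the resource, clear the pointer. */
    static void release_lockspace(void)
    {
        pthread_rwlock_wrlock(&ls_sem);
        ls = NULL;                         /* the real code releases the lockspace here */
        pthread_rwlock_unlock(&ls_sem);
    }

    int main(void)
    {
        static struct lockspace space = { .name = "demo" };

        ls = &space;
        do_lock_request();                                   /* succeeds */
        release_lockspace();
        printf("after teardown: %d\n", do_lock_request());   /* -19 (-ENODEV) */
        return 0;
    }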
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 0727f60ad028..9d65719353fa 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -151,7 +151,8 @@ static int __init init_gfs2_fs(void)
error = -ENOMEM;
gfs2_recovery_wq = alloc_workqueue("gfs2_recovery",
- WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU,
+ 0);
if (!gfs2_recovery_wq)
goto fail_wq1;
@@ -160,7 +161,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_control_wq)
goto fail_wq2;
- gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0);
+ gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", WQ_PERCPU, 0);
if (!gfs2_freeze_wq)
goto fail_wq3;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index efe99b732551..aa15183f9a16 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1193,13 +1193,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
error = -ENOMEM;
sdp->sd_glock_wq = alloc_workqueue("gfs2-glock/%s",
- WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0,
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE | WQ_PERCPU,
+ 0,
sdp->sd_fsname);
if (!sdp->sd_glock_wq)
goto fail_iput;
sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s",
- WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname);
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, 0,
+ sdp->sd_fsname);
if (!sdp->sd_delete_wq)
goto fail_glock_wq;
@@ -1754,7 +1756,7 @@ static void gfs2_evict_inodes(struct super_block *sb)
spin_unlock(&inode->i_lock);
continue;
}
- atomic_inc(&inode->i_count);
+ __iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b42e2110084b..644b2d1e7276 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode)
if (test_bit(SDF_EVICTING, &sdp->sd_flags))
return 1;
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
/**
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 26036ffc3f33..1c2507a27318 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -52,7 +52,6 @@
{(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \
{(1UL << GLF_DIRTY), "y" }, \
{(1UL << GLF_LFLUSH), "f" }, \
- {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
{(1UL << GLF_PENDING_REPLY), "R" }, \
{(1UL << GLF_HAVE_REPLY), "r" }, \
{(1UL << GLF_INITIAL), "a" }, \
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 24864a66074b..56412f63f3bb 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -309,7 +309,7 @@ void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...)
va_end(args);
}
-int gfs2_withdraw(struct gfs2_sbd *sdp)
+void gfs2_withdraw(struct gfs2_sbd *sdp)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
const struct lm_lockops *lm = ls->ls_ops;
@@ -322,7 +322,7 @@ int gfs2_withdraw(struct gfs2_sbd *sdp)
wait_on_bit(&sdp->sd_flags,
SDF_WITHDRAW_IN_PROG,
TASK_UNINTERRUPTIBLE);
- return -1;
+ return;
}
new = old | BIT(SDF_WITHDRAWN) | BIT(SDF_WITHDRAW_IN_PROG);
} while (unlikely(!try_cmpxchg(&sdp->sd_flags, &old, new)));
@@ -350,8 +350,6 @@ int gfs2_withdraw(struct gfs2_sbd *sdp)
if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname);
-
- return -1;
}
/*
@@ -473,46 +471,36 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
/*
* gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
- * Returns: -1 if this call withdrew the machine,
- * -2 if it was already withdrawn
*/
-int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
- const char *function, char *file,
- unsigned int line)
+void gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ const char *function, char *file,
+ unsigned int line)
{
- int me;
-
gfs2_lm(sdp,
"fatal: invalid metadata block - "
"bh = %llu (bad magic number), "
"function = %s, file = %s, line = %u\n",
(unsigned long long)bh->b_blocknr,
function, file, line);
- me = gfs2_withdraw(sdp);
- return (me) ? -1 : -2;
+ gfs2_withdraw(sdp);
}
/*
* gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
- * Returns: -1 if this call withdrew the machine,
- * -2 if it was already withdrawn
*/
-int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
- u16 type, u16 t, const char *function,
- char *file, unsigned int line)
+void gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ u16 type, u16 t, const char *function,
+ char *file, unsigned int line)
{
- int me;
-
gfs2_lm(sdp,
"fatal: invalid metadata block - "
"bh = %llu (type: exp=%u, found=%u), "
"function = %s, file = %s, line = %u\n",
(unsigned long long)bh->b_blocknr, type, t,
function, file, line);
- me = gfs2_withdraw(sdp);
- return (me) ? -1 : -2;
+ gfs2_withdraw(sdp);
}
/*
@@ -521,14 +509,14 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
* 0 if it was already withdrawn
*/
-int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
- unsigned int line)
+void gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
+ unsigned int line)
{
gfs2_lm(sdp,
"fatal: I/O error - "
"function = %s, file = %s, line = %u\n",
function, file, line);
- return gfs2_withdraw(sdp);
+ gfs2_withdraw(sdp);
}
/*
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 27d03b641024..da0373b1e82b 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -91,9 +91,9 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
gfs2_consist_rgrpd_i((rgd), __func__, __FILE__, __LINE__)
-int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
- const char *function,
- char *file, unsigned int line);
+void gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ const char *function,
+ char *file, unsigned int line);
static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
struct buffer_head *bh)
@@ -108,10 +108,10 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
return 0;
}
-int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
- u16 type, u16 t,
- const char *function,
- char *file, unsigned int line);
+void gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ u16 type, u16 t,
+ const char *function,
+ char *file, unsigned int line);
static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
struct buffer_head *bh,
@@ -122,12 +122,16 @@ static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
u32 magic = be32_to_cpu(mh->mh_magic);
u16 t = be32_to_cpu(mh->mh_type);
- if (unlikely(magic != GFS2_MAGIC))
- return gfs2_meta_check_ii(sdp, bh, function,
- file, line);
- if (unlikely(t != type))
- return gfs2_metatype_check_ii(sdp, bh, type, t, function,
- file, line);
+ if (unlikely(magic != GFS2_MAGIC)) {
+ gfs2_meta_check_ii(sdp, bh, function,
+ file, line);
+ return -EIO;
+ }
+ if (unlikely(t != type)) {
+ gfs2_metatype_check_ii(sdp, bh, type, t, function,
+ file, line);
+ return -EIO;
+ }
return 0;
}
@@ -144,8 +148,8 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
}
-int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
- char *file, unsigned int line);
+void gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
+ char *file, unsigned int line);
int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
bool verbose);
@@ -228,6 +232,6 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
__printf(2, 3)
void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...);
-int gfs2_withdraw(struct gfs2_sbd *sdp);
+void gfs2_withdraw(struct gfs2_sbd *sdp);
#endif /* __UTIL_DOT_H__ */
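
The gfs2 hunks above drop the old -1/-2 return convention: gfs2_withdraw() and the metadata-check helpers become void, and a caller that still needs an error code (gfs2_metatype_check_i() in util.h) now returns -EIO itself after invoking the checker. A minimal userspace sketch of the same refactor, using illustrative names rather than the kernel ones:

    /* Before the change the checker reported failure through a sentinel
     * return value; after it, the checker only logs and withdraws, and the
     * caller owns the errno (as gfs2_metatype_check_i() now does). */
    #include <errno.h>
    #include <stdio.h>

    static void report_bad_magic(unsigned long blocknr)
    {
            fprintf(stderr, "fatal: invalid metadata block %lu\n", blocknr);
            /* the withdraw-the-filesystem step would live here */
    }

    static int check_magic(unsigned long blocknr, unsigned int magic,
                           unsigned int expected)
    {
            if (magic != expected) {
                    report_bad_magic(blocknr);
                    return -EIO;    /* caller-visible error code */
            }
            return 0;
    }

    int main(void)
    {
            return check_magic(42, 0x1234, 0x4244) ? 1 : 0;
    }
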
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 34e9804e0f36..c2f840c49e60 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -21,12 +21,12 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->tree = tree;
fd->bnode = NULL;
- ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
+ ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
if (!ptr)
return -ENOMEM;
fd->search_key = ptr;
fd->key = ptr + tree->max_key_len + 2;
- hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
+ hfs_dbg("cnid %d, caller %ps\n",
tree->cnid, __builtin_return_address(0));
switch (tree->cnid) {
case HFS_CAT_CNID:
@@ -48,7 +48,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
{
hfs_bnode_put(fd->bnode);
kfree(fd->search_key);
- hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
+ hfs_dbg("cnid %d, caller %ps\n",
fd->tree->cnid, __builtin_return_address(0));
mutex_unlock(&fd->tree->tree_lock);
fd->tree = NULL;
@@ -115,6 +115,12 @@ int hfs_brec_find(struct hfs_find_data *fd)
__be32 data;
int height, res;
+ fd->record = -1;
+ fd->keyoffset = -1;
+ fd->keylength = -1;
+ fd->entryoffset = -1;
+ fd->entrylength = -1;
+
tree = fd->tree;
if (fd->bnode)
hfs_bnode_put(fd->bnode);
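
Two defensive changes sit in the bfind.c hunk above: the search-key buffer is now zero-initialized with kzalloc(), and hfs_brec_find() poisons the record/offset/length fields before walking the tree, so a failed lookup cannot be mistaken for the leftovers of an earlier successful one. A small standalone sketch of the reset-before-search idea (struct and names are illustrative, not the kernel ones):

    struct find_data {
            int record;
            int keyoffset, keylength;
            int entryoffset, entrylength;
    };

    /* Poison the lookup results so "not found" is distinguishable from
     * stale values left by a previous search. */
    static void find_data_reset(struct find_data *fd)
    {
            fd->record = -1;
            fd->keyoffset = -1;
            fd->keylength = -1;
            fd->entryoffset = -1;
            fd->entrylength = -1;
    }

    int main(void)
    {
            struct find_data fd;

            find_data_reset(&fd);
            return fd.record == -1 ? 0 : 1;
    }
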
diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c
index 28307bc9ec1e..5e84833a4743 100644
--- a/fs/hfs/bitmap.c
+++ b/fs/hfs/bitmap.c
@@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits)
}
}
- hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits);
+ hfs_dbg("pos %u, num_bits %u\n", pos, *num_bits);
HFS_SB(sb)->free_ablocks -= *num_bits;
hfs_bitmap_dirty(sb);
out:
@@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count)
if (!count)
return 0;
- hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count);
+ hfs_dbg("start %u, count %u\n", start, count);
/* are all of the bits in range? */
if ((start + count) > HFS_SB(sb)->fs_ablocks)
return -2;
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index e8cd1a31f247..fcfffe75d84e 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -200,7 +200,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
{
struct page *src_page, *dst_page;
- hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg("dst %u, src %u, len %u\n", dst, src, len);
if (!len)
return;
@@ -221,7 +221,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
struct page *page;
void *ptr;
- hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg("dst %u, src %u, len %u\n", dst, src, len);
if (!len)
return;
@@ -243,16 +243,16 @@ void hfs_bnode_dump(struct hfs_bnode *node)
__be32 cnid;
int i, off, key_off;
- hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
+ hfs_dbg("node %d\n", node->this);
hfs_bnode_read(node, &desc, 0, sizeof(desc));
- hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
+ hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n",
be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
desc.type, desc.height, be16_to_cpu(desc.num_recs));
off = node->tree->node_size - 2;
for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
key_off = hfs_bnode_read_u16(node, off);
- hfs_dbg_cont(BNODE_MOD, " %d", key_off);
+ hfs_dbg(" key_off %d", key_off);
if (i && node->type == HFS_NODE_INDEX) {
int tmp;
@@ -260,18 +260,18 @@ void hfs_bnode_dump(struct hfs_bnode *node)
tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1;
else
tmp = node->tree->max_key_len + 1;
- hfs_dbg_cont(BNODE_MOD, " (%d,%d",
- tmp, hfs_bnode_read_u8(node, key_off));
+ hfs_dbg(" (%d,%d",
+ tmp, hfs_bnode_read_u8(node, key_off));
hfs_bnode_read(node, &cnid, key_off + tmp, 4);
- hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
+ hfs_dbg(", cnid %d)", be32_to_cpu(cnid));
} else if (i && node->type == HFS_NODE_LEAF) {
int tmp;
tmp = hfs_bnode_read_u8(node, key_off);
- hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
+ hfs_dbg(" (%d)", tmp);
}
}
- hfs_dbg_cont(BNODE_MOD, "\n");
+ hfs_dbg("\n");
}
void hfs_bnode_unlink(struct hfs_bnode *node)
@@ -361,7 +361,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
node->this = cnid;
set_bit(HFS_BNODE_NEW, &node->flags);
atomic_set(&node->refcnt, 1);
- hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
+ hfs_dbg("cnid %d, node %d, refcnt 1\n",
node->tree->cnid, node->this);
init_waitqueue_head(&node->lock_wq);
spin_lock(&tree->hash_lock);
@@ -401,7 +401,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node)
{
struct hfs_bnode **p;
- hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this, atomic_read(&node->refcnt));
for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)];
*p && *p != node; p = &(*p)->next_hash)
@@ -546,7 +546,7 @@ void hfs_bnode_get(struct hfs_bnode *node)
{
if (node) {
atomic_inc(&node->refcnt);
- hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
}
@@ -559,7 +559,7 @@ void hfs_bnode_put(struct hfs_bnode *node)
struct hfs_btree *tree = node->tree;
int i;
- hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
BUG_ON(!atomic_read(&node->refcnt));
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 896396554bcc..e49a141c87e5 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -94,7 +94,7 @@ again:
end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
end_off = hfs_bnode_read_u16(node, end_rec_off);
end_rec_off -= 2;
- hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+ hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n",
rec, size, end_off, end_rec_off);
if (size > end_rec_off - end_off) {
if (new_node)
@@ -179,6 +179,7 @@ int hfs_brec_remove(struct hfs_find_data *fd)
struct hfs_btree *tree;
struct hfs_bnode *node, *parent;
int end_off, rec_off, data_off, size;
+ int src, dst, len;
tree = fd->tree;
node = fd->bnode;
@@ -191,7 +192,7 @@ again:
mark_inode_dirty(tree->inode);
}
hfs_bnode_dump(node);
- hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
+ hfs_dbg("rec %d, len %d\n",
fd->record, fd->keylength + fd->entrylength);
if (!--node->num_recs) {
hfs_bnode_unlink(node);
@@ -208,10 +209,14 @@ again:
}
hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
- if (rec_off == end_off)
- goto skip;
size = fd->keylength + fd->entrylength;
+ if (rec_off == end_off) {
+ src = fd->keyoffset;
+ hfs_bnode_clear(node, src, size);
+ goto skip;
+ }
+
do {
data_off = hfs_bnode_read_u16(node, rec_off);
hfs_bnode_write_u16(node, rec_off + 2, data_off - size);
@@ -219,9 +224,23 @@ again:
} while (rec_off >= end_off);
/* fill hole */
- hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size,
- data_off - fd->keyoffset - size);
+ dst = fd->keyoffset;
+ src = fd->keyoffset + size;
+ len = data_off - src;
+
+ hfs_bnode_move(node, dst, src, len);
+
+ src = dst + len;
+ len = data_off - src;
+
+ hfs_bnode_clear(node, src, len);
+
skip:
+ /*
+ * Remove the obsolete offset to free space.
+ */
+ hfs_bnode_write_u16(node, end_off, 0);
+
hfs_bnode_dump(node);
if (!fd->record)
hfs_brec_update_parent(fd);
@@ -242,7 +261,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
if (IS_ERR(new_node))
return new_node;
hfs_bnode_get(node);
- hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
+ hfs_dbg("this %d, new %d, next %d\n",
node->this, new_node->this, node->next);
new_node->next = node->next;
new_node->prev = node->this;
@@ -378,7 +397,7 @@ again:
newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1;
else
fd->keylength = newkeylen = tree->max_key_len + 1;
- hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
+ hfs_dbg("rec %d, keylength %d, newkeylen %d\n",
rec, fd->keylength, newkeylen);
rec_off = tree->node_size - (rec + 2) * 2;
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index e86e1e235658..22e62fe7448b 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -364,7 +364,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
u32 nidx;
u8 *data, byte, m;
- hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this);
+ hfs_dbg("node %u\n", node->this);
tree = node->tree;
nidx = node->this;
node = hfs_bnode_find(tree, 0);
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index d63880e7d9d6..caebabb6642f 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -87,7 +87,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
int entry_size;
int err;
- hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
+ hfs_dbg("name %s, cnid %u, i_nlink %d\n",
str->name, cnid, inode->i_nlink);
if (dir->i_size >= HFS_MAX_VALENCE)
return -ENOSPC;
@@ -211,6 +211,124 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
return hfs_brec_find(fd);
}
+static inline
+void hfs_set_next_unused_CNID(struct super_block *sb,
+ u32 deleted_cnid, u32 found_cnid)
+{
+ if (found_cnid < HFS_FIRSTUSER_CNID) {
+ atomic64_cmpxchg(&HFS_SB(sb)->next_id,
+ deleted_cnid + 1, HFS_FIRSTUSER_CNID);
+ } else {
+ atomic64_cmpxchg(&HFS_SB(sb)->next_id,
+ deleted_cnid + 1, found_cnid + 1);
+ }
+}
+
+/*
+ * hfs_correct_next_unused_CNID()
+ *
+ * Correct the next unused CNID of Catalog Tree.
+ */
+static
+int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid)
+{
+ struct hfs_btree *cat_tree;
+ struct hfs_bnode *node;
+ s64 leaf_head;
+ s64 leaf_tail;
+ s64 node_id;
+
+ hfs_dbg("cnid %u, next_id %lld\n",
+ cnid, atomic64_read(&HFS_SB(sb)->next_id));
+
+ if ((cnid + 1) < atomic64_read(&HFS_SB(sb)->next_id)) {
+ /* next ID should be unchanged */
+ return 0;
+ }
+
+ cat_tree = HFS_SB(sb)->cat_tree;
+ leaf_head = cat_tree->leaf_head;
+ leaf_tail = cat_tree->leaf_tail;
+
+ if (leaf_head > leaf_tail) {
+ pr_err("node is corrupted: leaf_head %lld, leaf_tail %lld\n",
+ leaf_head, leaf_tail);
+ return -ERANGE;
+ }
+
+ node = hfs_bnode_find(cat_tree, leaf_tail);
+ if (IS_ERR(node)) {
+ pr_err("fail to find leaf node: node ID %lld\n",
+ leaf_tail);
+ return -ENOENT;
+ }
+
+ node_id = leaf_tail;
+
+ do {
+ int i;
+
+ if (node_id != leaf_tail) {
+ node = hfs_bnode_find(cat_tree, node_id);
+ if (IS_ERR(node))
+ return -ENOENT;
+ }
+
+ hfs_dbg("node %lld, leaf_tail %lld, leaf_head %lld\n",
+ node_id, leaf_tail, leaf_head);
+
+ hfs_bnode_dump(node);
+
+ for (i = node->num_recs - 1; i >= 0; i--) {
+ hfs_cat_rec rec;
+ u16 off, len, keylen;
+ int entryoffset;
+ int entrylength;
+ u32 found_cnid;
+
+ len = hfs_brec_lenoff(node, i, &off);
+ keylen = hfs_brec_keylen(node, i);
+ if (keylen == 0) {
+ pr_err("fail to get the keylen: "
+ "node_id %lld, record index %d\n",
+ node_id, i);
+ return -EINVAL;
+ }
+
+ entryoffset = off + keylen;
+ entrylength = len - keylen;
+
+ if (entrylength > sizeof(rec)) {
+ pr_err("unexpected record length: "
+ "entrylength %d\n",
+ entrylength);
+ return -EINVAL;
+ }
+
+ hfs_bnode_read(node, &rec, entryoffset, entrylength);
+
+ if (rec.type == HFS_CDR_DIR) {
+ found_cnid = be32_to_cpu(rec.dir.DirID);
+ hfs_dbg("found_cnid %u\n", found_cnid);
+ hfs_set_next_unused_CNID(sb, cnid, found_cnid);
+ hfs_bnode_put(node);
+ return 0;
+ } else if (rec.type == HFS_CDR_FIL) {
+ found_cnid = be32_to_cpu(rec.file.FlNum);
+ hfs_dbg("found_cnid %u\n", found_cnid);
+ hfs_set_next_unused_CNID(sb, cnid, found_cnid);
+ hfs_bnode_put(node);
+ return 0;
+ }
+ }
+
+ hfs_bnode_put(node);
+
+ node_id = node->prev;
+ } while (node_id >= leaf_head);
+
+ return -ENOENT;
+}
/*
* hfs_cat_delete()
@@ -225,7 +343,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
struct hfs_readdir_data *rd;
int res, type;
- hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+ hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid);
sb = dir->i_sb;
res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
if (res)
@@ -271,6 +389,11 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
dir->i_size--;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
+
+ res = hfs_correct_next_unused_CNID(sb, cnid);
+ if (res)
+ goto out;
+
res = 0;
out:
hfs_find_exit(&fd);
@@ -294,7 +417,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
int entry_size, type;
int err;
- hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+ hfs_dbg("cnid %u - (ino %lu, name %s) - (ino %lu, name %s)\n",
cnid, src_dir->i_ino, src_name->name,
dst_dir->i_ino, dst_name->name);
sb = src_dir->i_sb;
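
hfs_correct_next_unused_CNID() above walks the catalog leaves backwards from the tail looking for the largest CNID still in use, and hfs_set_next_unused_CNID() publishes the result with atomic64_cmpxchg() so the correction only lands if next_id still holds the value the deletion left behind (deleted_cnid + 1); a concurrent allocator that already advanced next_id simply wins. A userspace C11 sketch of that conditional-publish pattern (types and names are illustrative):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FIRSTUSER_CNID 16

    static _Atomic int64_t next_id;

    /* Publish the correction only if next_id is still deleted_cnid + 1;
     * otherwise somebody allocated in the meantime and the cmpxchg is a no-op. */
    static void set_next_unused_cnid(uint32_t deleted_cnid, uint32_t found_cnid)
    {
            int64_t expected = (int64_t)deleted_cnid + 1;
            int64_t desired = (found_cnid < FIRSTUSER_CNID)
                                    ? FIRSTUSER_CNID
                                    : (int64_t)found_cnid + 1;

            atomic_compare_exchange_strong(&next_id, &expected, desired);
    }

    int main(void)
    {
            atomic_store(&next_id, 101);    /* CNID 100 was just deleted */
            set_next_unused_cnid(100, 57);  /* largest CNID still in use */
            printf("next_id = %lld\n", (long long)atomic_load(&next_id));
            return 0;
    }
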
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 580c62981dbd..a097908b269d 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -209,12 +209,12 @@ static void hfs_dump_extent(struct hfs_extent *extent)
{
int i;
- hfs_dbg(EXTENT, " ");
+ hfs_dbg("extent: ");
for (i = 0; i < 3; i++)
- hfs_dbg_cont(EXTENT, " %u:%u",
- be16_to_cpu(extent[i].block),
- be16_to_cpu(extent[i].count));
- hfs_dbg_cont(EXTENT, "\n");
+ hfs_dbg(" block %u, count %u",
+ be16_to_cpu(extent[i].block),
+ be16_to_cpu(extent[i].count));
+ hfs_dbg("\n");
}
static int hfs_add_extent(struct hfs_extent *extent, u16 offset,
@@ -411,10 +411,11 @@ int hfs_extend_file(struct inode *inode)
goto out;
}
- hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
+ hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len);
if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) {
if (!HFS_I(inode)->first_blocks) {
- hfs_dbg(EXTENT, "first extents\n");
+ hfs_dbg("first_extent: start %u, len %u\n",
+ start, len);
/* no extents yet */
HFS_I(inode)->first_extents[0].block = cpu_to_be16(start);
HFS_I(inode)->first_extents[0].count = cpu_to_be16(len);
@@ -456,7 +457,7 @@ out:
return res;
insert_extent:
- hfs_dbg(EXTENT, "insert new extent\n");
+ hfs_dbg("insert new extent\n");
res = hfs_ext_write_extent(inode);
if (res)
goto out;
@@ -481,7 +482,7 @@ void hfs_file_truncate(struct inode *inode)
u32 size;
int res;
- hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n",
+ hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n",
inode->i_ino, (long long)HFS_I(inode)->phys_size,
inode->i_size);
if (inode->i_size > HFS_I(inode)->phys_size) {
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 7c5a7ecfa246..fff149af89da 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -9,12 +9,6 @@
#ifndef _LINUX_HFS_FS_H
#define _LINUX_HFS_FS_H
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/mutex.h>
@@ -24,35 +18,10 @@
#include <asm/byteorder.h>
#include <linux/uaccess.h>
+#include <linux/hfs_common.h>
#include "hfs.h"
-#define DBG_BNODE_REFS 0x00000001
-#define DBG_BNODE_MOD 0x00000002
-#define DBG_CAT_MOD 0x00000004
-#define DBG_INODE 0x00000008
-#define DBG_SUPER 0x00000010
-#define DBG_EXTENT 0x00000020
-#define DBG_BITMAP 0x00000040
-
-//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP)
-//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
-//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
-#define DBG_MASK (0)
-
-#define hfs_dbg(flg, fmt, ...) \
-do { \
- if (DBG_##flg & DBG_MASK) \
- printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
-} while (0)
-
-#define hfs_dbg_cont(flg, fmt, ...) \
-do { \
- if (DBG_##flg & DBG_MASK) \
- pr_cont(fmt, ##__VA_ARGS__); \
-} while (0)
-
-
/*
* struct hfs_inode_info
*
@@ -112,13 +81,13 @@ struct hfs_sb_info {
the extents b-tree */
struct hfs_btree *cat_tree; /* Information about
the catalog b-tree */
- u32 file_count; /* The number of
+ atomic64_t file_count; /* The number of
regular files in
the filesystem */
- u32 folder_count; /* The number of
+ atomic64_t folder_count; /* The number of
directories in the
filesystem */
- u32 next_id; /* The next available
+ atomic64_t next_id; /* The next available
file id number */
u32 clumpablks; /* The number of allocation
blocks to try to add when
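
The per-subsystem DBG_* mask machinery is removed from hfs_fs.h; the converted call sites assume a single printf-style hfs_dbg() provided by the new <linux/hfs_common.h>, whose definition is not part of this diff. One plausible stand-in, shown only so the call sites above read naturally -- the no-flag-argument signature is the assumption here:

    #include <stdio.h>

    /* Illustrative stand-in only: the real hfs_dbg() comes from
     * <linux/hfs_common.h> and is not shown in this series. */
    #define hfs_dbg(fmt, ...) \
            fprintf(stderr, "hfs: %s: " fmt, __func__, ##__VA_ARGS__)

    int main(void)
    {
            hfs_dbg("cnid %d\n", 16);
            return 0;
    }
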
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index bf4cb7e78396..9cd449913dc8 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -183,6 +183,10 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
{
struct super_block *sb = dir->i_sb;
struct inode *inode = new_inode(sb);
+ s64 next_id;
+ s64 file_count;
+ s64 folder_count;
+
if (!inode)
return NULL;
@@ -190,7 +194,9 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
spin_lock_init(&HFS_I(inode)->open_dir_lock);
hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
- inode->i_ino = HFS_SB(sb)->next_id++;
+ next_id = atomic64_inc_return(&HFS_SB(sb)->next_id);
+ BUG_ON(next_id > U32_MAX);
+ inode->i_ino = (u32)next_id;
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
@@ -202,7 +208,8 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60;
if (S_ISDIR(mode)) {
inode->i_size = 2;
- HFS_SB(sb)->folder_count++;
+ folder_count = atomic64_inc_return(&HFS_SB(sb)->folder_count);
+ BUG_ON(folder_count > U32_MAX);
if (dir->i_ino == HFS_ROOT_CNID)
HFS_SB(sb)->root_dirs++;
inode->i_op = &hfs_dir_inode_operations;
@@ -211,7 +218,8 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask;
} else if (S_ISREG(mode)) {
HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks;
- HFS_SB(sb)->file_count++;
+ file_count = atomic64_inc_return(&HFS_SB(sb)->file_count);
+ BUG_ON(file_count > U32_MAX);
if (dir->i_ino == HFS_ROOT_CNID)
HFS_SB(sb)->root_files++;
inode->i_op = &hfs_file_inode_operations;
@@ -241,16 +249,19 @@ void hfs_delete_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino);
+ hfs_dbg("ino %lu\n", inode->i_ino);
if (S_ISDIR(inode->i_mode)) {
- HFS_SB(sb)->folder_count--;
+ BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX);
+ atomic64_dec(&HFS_SB(sb)->folder_count);
if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
HFS_SB(sb)->root_dirs--;
set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
hfs_mark_mdb_dirty(sb);
return;
}
- HFS_SB(sb)->file_count--;
+
+ BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX);
+ atomic64_dec(&HFS_SB(sb)->file_count);
if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
HFS_SB(sb)->root_files--;
if (S_ISREG(inode->i_mode)) {
@@ -425,7 +436,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_cat_rec rec;
int res;
- hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino);
+ hfs_dbg("ino %lu\n", inode->i_ino);
res = hfs_ext_write_extent(inode);
if (res)
return res;
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8082eb01127c..53f3fae60217 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -150,11 +150,11 @@ int hfs_mdb_get(struct super_block *sb)
/* These parameters are read from and written to the MDB */
HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks);
- HFS_SB(sb)->next_id = be32_to_cpu(mdb->drNxtCNID);
+ atomic64_set(&HFS_SB(sb)->next_id, be32_to_cpu(mdb->drNxtCNID));
HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls);
HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs);
- HFS_SB(sb)->file_count = be32_to_cpu(mdb->drFilCnt);
- HFS_SB(sb)->folder_count = be32_to_cpu(mdb->drDirCnt);
+ atomic64_set(&HFS_SB(sb)->file_count, be32_to_cpu(mdb->drFilCnt));
+ atomic64_set(&HFS_SB(sb)->folder_count, be32_to_cpu(mdb->drDirCnt));
/* TRY to get the alternate (backup) MDB. */
sect = part_start + part_size - 2;
@@ -172,7 +172,7 @@ int hfs_mdb_get(struct super_block *sb)
pr_warn("continuing without an alternate MDB\n");
}
- HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL);
+ HFS_SB(sb)->bitmap = kzalloc(8192, GFP_KERNEL);
if (!HFS_SB(sb)->bitmap)
goto out;
@@ -273,11 +273,17 @@ void hfs_mdb_commit(struct super_block *sb)
/* These parameters may have been modified, so write them back */
mdb->drLsMod = hfs_mtime();
mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks);
- mdb->drNxtCNID = cpu_to_be32(HFS_SB(sb)->next_id);
+ BUG_ON(atomic64_read(&HFS_SB(sb)->next_id) > U32_MAX);
+ mdb->drNxtCNID =
+ cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->next_id));
mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files);
mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs);
- mdb->drFilCnt = cpu_to_be32(HFS_SB(sb)->file_count);
- mdb->drDirCnt = cpu_to_be32(HFS_SB(sb)->folder_count);
+ BUG_ON(atomic64_read(&HFS_SB(sb)->file_count) > U32_MAX);
+ mdb->drFilCnt =
+ cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->file_count));
+ BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX);
+ mdb->drDirCnt =
+ cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->folder_count));
/* write MDB to disk */
mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
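
The MDB commit path above reads the new atomic64_t counters and narrows them back into the 32-bit big-endian on-disk fields, with BUG_ON() guarding against a value that no longer fits. The same check-before-narrowing idea in plain C, with assert() standing in for BUG_ON() and illustrative names:

    #include <assert.h>
    #include <stdint.h>
    #include <arpa/inet.h>  /* htonl(), for the big-endian on-disk form */

    /* Narrow a 64-bit in-memory counter into a 32-bit big-endian field. */
    static uint32_t counter_to_disk(int64_t counter)
    {
            assert(counter >= 0 && counter <= UINT32_MAX);
            return htonl((uint32_t)counter);
    }

    int main(void)
    {
            return counter_to_disk(12345) != 0 ? 0 : 1;
    }
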
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 388a318297ec..47f50fa555a4 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -319,6 +319,10 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc)
int silent = fc->sb_flags & SB_SILENT;
int res;
+ atomic64_set(&sbi->file_count, 0);
+ atomic64_set(&sbi->folder_count, 0);
+ atomic64_set(&sbi->next_id, 0);
+
/* load_nls_default does not fail */
if (sbi->nls_disk && !sbi->nls_io)
sbi->nls_io = load_nls_default();
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index eeebe80c6be4..ba26980cc503 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -139,7 +139,7 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid,
{
int err = 0;
- hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid);
+ hfs_dbg("name %s, cnid %d\n", name ? name : NULL, cnid);
if (!HFSPLUS_SB(sb)->attr_tree) {
pr_err("attributes file doesn't exist\n");
@@ -201,7 +201,7 @@ int hfsplus_create_attr(struct inode *inode,
int entry_size;
int err;
- hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n",
+ hfs_dbg("name %s, ino %ld\n",
name ? name : NULL, inode->i_ino);
if (!HFSPLUS_SB(sb)->attr_tree) {
@@ -310,7 +310,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name)
struct super_block *sb = inode->i_sb;
struct hfs_find_data fd;
- hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n",
+ hfs_dbg("name %s, ino %ld\n",
name ? name : NULL, inode->i_ino);
if (!HFSPLUS_SB(sb)->attr_tree) {
@@ -356,7 +356,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid)
int err = 0;
struct hfs_find_data fd;
- hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid);
+ hfs_dbg("cnid %d\n", cnid);
if (!HFSPLUS_SB(dir->i_sb)->attr_tree) {
pr_err("attributes file doesn't exist\n");
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 901e83d65d20..afc9c89e8c6a 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -18,12 +18,12 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->tree = tree;
fd->bnode = NULL;
- ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
+ ptr = kzalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
if (!ptr)
return -ENOMEM;
fd->search_key = ptr;
fd->key = ptr + tree->max_key_len + 2;
- hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
+ hfs_dbg("cnid %d, caller %ps\n",
tree->cnid, __builtin_return_address(0));
mutex_lock_nested(&tree->tree_lock,
hfsplus_btree_lock_class(tree));
@@ -34,7 +34,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
{
hfs_bnode_put(fd->bnode);
kfree(fd->search_key);
- hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
+ hfs_dbg("cnid %d, caller %ps\n",
fd->tree->cnid, __builtin_return_address(0));
mutex_unlock(&fd->tree->tree_lock);
fd->tree = NULL;
@@ -158,6 +158,12 @@ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare)
__be32 data;
int height, res;
+ fd->record = -1;
+ fd->keyoffset = -1;
+ fd->keylength = -1;
+ fd->entryoffset = -1;
+ fd->entrylength = -1;
+
tree = fd->tree;
if (fd->bnode)
hfs_bnode_put(fd->bnode);
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index bd8dcea85588..1b3af8c87cad 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -31,7 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
if (!len)
return size;
- hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
+ hfs_dbg("size %u, offset %u, len %u\n", size, offset, len);
mutex_lock(&sbi->alloc_mutex);
mapping = sbi->alloc_file->i_mapping;
page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
@@ -90,14 +90,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
else
end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32;
}
- hfs_dbg(BITMAP, "bitmap full\n");
+ hfs_dbg("bitmap full\n");
start = size;
goto out;
found:
start = offset + (curr - pptr) * 32 + i;
if (start >= size) {
- hfs_dbg(BITMAP, "bitmap full\n");
+ hfs_dbg("bitmap full\n");
goto out;
}
/* do any partial u32 at the start */
@@ -155,7 +155,7 @@ done:
*max = offset + (curr - pptr) * 32 + i - start;
sbi->free_blocks -= *max;
hfsplus_mark_mdb_dirty(sb);
- hfs_dbg(BITMAP, "-> %u,%u\n", start, *max);
+ hfs_dbg("start %u, max %u\n", start, *max);
out:
mutex_unlock(&sbi->alloc_mutex);
return start;
@@ -174,7 +174,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
if (!count)
return 0;
- hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count);
+ hfs_dbg("offset %u, count %u\n", offset, count);
/* are all of the bits in range? */
if ((offset + count) > sbi->total_blocks)
return -ENOENT;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 14f4995588ff..63e652ad1e0d 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -18,47 +18,6 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
-static inline
-bool is_bnode_offset_valid(struct hfs_bnode *node, int off)
-{
- bool is_valid = off < node->tree->node_size;
-
- if (!is_valid) {
- pr_err("requested invalid offset: "
- "NODE: id %u, type %#x, height %u, "
- "node_size %u, offset %d\n",
- node->this, node->type, node->height,
- node->tree->node_size, off);
- }
-
- return is_valid;
-}
-
-static inline
-int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
-{
- unsigned int node_size;
-
- if (!is_bnode_offset_valid(node, off))
- return 0;
-
- node_size = node->tree->node_size;
-
- if ((off + len) > node_size) {
- int new_len = (int)node_size - off;
-
- pr_err("requested length has been corrected: "
- "NODE: id %u, type %#x, height %u, "
- "node_size %u, offset %d, "
- "requested_len %d, corrected_len %d\n",
- node->this, node->type, node->height,
- node->tree->node_size, off, len, new_len);
-
- return new_len;
- }
-
- return len;
-}
/* Copy a specified range of bytes from the raw data of a node */
void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
@@ -214,7 +173,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
struct page **src_page, **dst_page;
int l;
- hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg("dst %u, src %u, len %u\n", dst, src, len);
if (!len)
return;
@@ -272,7 +231,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
void *src_ptr, *dst_ptr;
int l;
- hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
+ hfs_dbg("dst %u, src %u, len %u\n", dst, src, len);
if (!len)
return;
@@ -392,16 +351,16 @@ void hfs_bnode_dump(struct hfs_bnode *node)
__be32 cnid;
int i, off, key_off;
- hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
+ hfs_dbg("node %d\n", node->this);
hfs_bnode_read(node, &desc, 0, sizeof(desc));
- hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
+ hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n",
be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
desc.type, desc.height, be16_to_cpu(desc.num_recs));
off = node->tree->node_size - 2;
for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
key_off = hfs_bnode_read_u16(node, off);
- hfs_dbg(BNODE_MOD, " %d", key_off);
+ hfs_dbg(" key_off %d", key_off);
if (i && node->type == HFS_NODE_INDEX) {
int tmp;
@@ -410,17 +369,17 @@ void hfs_bnode_dump(struct hfs_bnode *node)
tmp = hfs_bnode_read_u16(node, key_off) + 2;
else
tmp = node->tree->max_key_len + 2;
- hfs_dbg_cont(BNODE_MOD, " (%d", tmp);
+ hfs_dbg(" (%d", tmp);
hfs_bnode_read(node, &cnid, key_off + tmp, 4);
- hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
+ hfs_dbg(", cnid %d)", be32_to_cpu(cnid));
} else if (i && node->type == HFS_NODE_LEAF) {
int tmp;
tmp = hfs_bnode_read_u16(node, key_off);
- hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
+ hfs_dbg(" (%d)", tmp);
}
}
- hfs_dbg_cont(BNODE_MOD, "\n");
+ hfs_dbg("\n");
}
void hfs_bnode_unlink(struct hfs_bnode *node)
@@ -456,7 +415,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
/* move down? */
if (!node->prev && !node->next)
- hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n");
+ hfs_dbg("btree delete level\n");
if (!node->parent) {
tree->root = 0;
tree->depth = 0;
@@ -511,7 +470,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
node->this = cnid;
set_bit(HFS_BNODE_NEW, &node->flags);
atomic_set(&node->refcnt, 1);
- hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
+ hfs_dbg("cnid %d, node %d, refcnt 1\n",
node->tree->cnid, node->this);
init_waitqueue_head(&node->lock_wq);
spin_lock(&tree->hash_lock);
@@ -551,7 +510,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node)
{
struct hfs_bnode **p;
- hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this, atomic_read(&node->refcnt));
for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)];
*p && *p != node; p = &(*p)->next_hash)
@@ -697,7 +656,7 @@ void hfs_bnode_get(struct hfs_bnode *node)
{
if (node) {
atomic_inc(&node->refcnt);
- hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
}
@@ -710,7 +669,7 @@ void hfs_bnode_put(struct hfs_bnode *node)
struct hfs_btree *tree = node->tree;
int i;
- hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
+ hfs_dbg("cnid %d, node %d, refcnt %d\n",
node->tree->cnid, node->this,
atomic_read(&node->refcnt));
BUG_ON(!atomic_read(&node->refcnt));
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 1918544a7871..b4645102feec 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -92,7 +92,7 @@ again:
end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
end_off = hfs_bnode_read_u16(node, end_rec_off);
end_rec_off -= 2;
- hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+ hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n",
rec, size, end_off, end_rec_off);
if (size > end_rec_off - end_off) {
if (new_node)
@@ -193,7 +193,7 @@ again:
mark_inode_dirty(tree->inode);
}
hfs_bnode_dump(node);
- hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
+ hfs_dbg("rec %d, len %d\n",
fd->record, fd->keylength + fd->entrylength);
if (!--node->num_recs) {
hfs_bnode_unlink(node);
@@ -246,7 +246,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
if (IS_ERR(new_node))
return new_node;
hfs_bnode_get(node);
- hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
+ hfs_dbg("this %d - new %d - next %d\n",
node->this, new_node->this, node->next);
new_node->next = node->next;
new_node->prev = node->this;
@@ -383,7 +383,7 @@ again:
newkeylen = hfs_bnode_read_u16(node, 14) + 2;
else
fd->keylength = newkeylen = tree->max_key_len + 2;
- hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
+ hfs_dbg("rec %d, keylength %d, newkeylen %d\n",
rec, fd->keylength, newkeylen);
rec_off = tree->node_size - (rec + 2) * 2;
@@ -395,7 +395,7 @@ again:
end_off = hfs_bnode_read_u16(parent, end_rec_off);
if (end_rec_off - end_off < diff) {
- hfs_dbg(BNODE_MOD, "splitting index node\n");
+ hfs_dbg("splitting index node\n");
fd->bnode = parent;
new_node = hfs_bnode_split(fd);
if (IS_ERR(new_node))
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 9e1732a2b92a..7cc5aea14572 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -393,6 +393,12 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
len = hfs_brec_lenoff(node, 2, &off16);
off = off16;
+ if (!is_bnode_offset_valid(node, off)) {
+ hfs_bnode_put(node);
+ return ERR_PTR(-EIO);
+ }
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
data = kmap_local_page(*pagep);
@@ -428,7 +434,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
kunmap_local(data);
nidx = node->next;
if (!nidx) {
- hfs_dbg(BNODE_MOD, "create new bmap node\n");
+ hfs_dbg("create new bmap node\n");
next_node = hfs_bmap_new_bmap(node, idx);
} else
next_node = hfs_bnode_find(tree, nidx);
@@ -454,7 +460,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
u32 nidx;
u8 *data, byte, m;
- hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this);
+ hfs_dbg("node %u\n", node->this);
BUG_ON(!node->this);
tree = node->tree;
nidx = node->this;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 1995bafee839..02c1eee4a4b8 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -259,7 +259,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
int entry_size;
int err;
- hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
+ hfs_dbg("name %s, cnid %u, i_nlink %d\n",
str->name, cnid, inode->i_nlink);
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
@@ -336,7 +336,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
int err, off;
u16 type;
- hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+ hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid);
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
return err;
@@ -441,7 +441,7 @@ int hfsplus_rename_cat(u32 cnid,
int entry_size, type;
int err;
- hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+ hfs_dbg("cnid %u - ino %lu, name %s - ino %lu, name %s\n",
cnid, src_dir->i_ino, src_name->name,
dst_dir->i_ino, dst_name->name);
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 876bbb80fb4d..1b3e27a0d5e0 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -204,7 +204,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
fd.entrylength);
type = be16_to_cpu(entry.type);
len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN;
- err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
+ err = hfsplus_uni2asc_str(sb, &fd.key->cat.name, strbuf, &len);
if (err)
goto out;
if (type == HFSPLUS_FOLDER) {
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index b1699b3c246a..8e886514d27f 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -275,7 +275,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
mutex_unlock(&hip->extents_lock);
done:
- hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n",
+ hfs_dbg("ino %lu, iblock %llu - dblock %u\n",
inode->i_ino, (long long)iblock, dblock);
mask = (1 << sbi->fs_shift) - 1;
@@ -298,12 +298,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent)
{
int i;
- hfs_dbg(EXTENT, " ");
+ hfs_dbg("extent ");
for (i = 0; i < 8; i++)
- hfs_dbg_cont(EXTENT, " %u:%u",
- be32_to_cpu(extent[i].start_block),
- be32_to_cpu(extent[i].block_count));
- hfs_dbg_cont(EXTENT, "\n");
+ hfs_dbg(" start_block %u, block_count %u",
+ be32_to_cpu(extent[i].start_block),
+ be32_to_cpu(extent[i].block_count));
+ hfs_dbg("\n");
}
static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset,
@@ -359,8 +359,7 @@ found:
if (count <= block_nr) {
err = hfsplus_block_free(sb, start, count);
if (err) {
- pr_err("can't free extent\n");
- hfs_dbg(EXTENT, " start: %u count: %u\n",
+ pr_err("can't free extent: start %u, count %u\n",
start, count);
}
extent->block_count = 0;
@@ -370,8 +369,7 @@ found:
count -= block_nr;
err = hfsplus_block_free(sb, start + count, block_nr);
if (err) {
- pr_err("can't free extent\n");
- hfs_dbg(EXTENT, " start: %u count: %u\n",
+ pr_err("can't free extent: start %u, count %u\n",
start, count);
}
extent->block_count = cpu_to_be32(count);
@@ -478,11 +476,12 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout)
goto out;
}
- hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
+ hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len);
if (hip->alloc_blocks <= hip->first_blocks) {
if (!hip->first_blocks) {
- hfs_dbg(EXTENT, "first extents\n");
+ hfs_dbg("first_extent: start %u, len %u\n",
+ start, len);
/* no extents yet */
hip->first_extents[0].start_block = cpu_to_be32(start);
hip->first_extents[0].block_count = cpu_to_be32(len);
@@ -521,7 +520,7 @@ out:
return res;
insert_extent:
- hfs_dbg(EXTENT, "insert new extent\n");
+ hfs_dbg("insert new extent\n");
res = hfsplus_ext_write_extent_locked(inode);
if (res)
goto out;
@@ -546,7 +545,7 @@ void hfsplus_file_truncate(struct inode *inode)
u32 alloc_cnt, blk_cnt, start;
int res;
- hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n",
+ hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n",
inode->i_ino, (long long)hip->phys_size, inode->i_size);
if (inode->i_size > hip->phys_size) {
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 96a5c24813dd..89e8b19c127b 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -11,47 +11,14 @@
#ifndef _LINUX_HFSPLUS_FS_H
#define _LINUX_HFSPLUS_FS_H
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/fs_context.h>
+#include <linux/hfs_common.h>
#include "hfsplus_raw.h"
-#define DBG_BNODE_REFS 0x00000001
-#define DBG_BNODE_MOD 0x00000002
-#define DBG_CAT_MOD 0x00000004
-#define DBG_INODE 0x00000008
-#define DBG_SUPER 0x00000010
-#define DBG_EXTENT 0x00000020
-#define DBG_BITMAP 0x00000040
-#define DBG_ATTR_MOD 0x00000080
-
-#if 0
-#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
-#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
-#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
-#endif
-#define DBG_MASK (0)
-
-#define hfs_dbg(flg, fmt, ...) \
-do { \
- if (DBG_##flg & DBG_MASK) \
- printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
-} while (0)
-
-#define hfs_dbg_cont(flg, fmt, ...) \
-do { \
- if (DBG_##flg & DBG_MASK) \
- pr_cont(fmt, ##__VA_ARGS__); \
-} while (0)
-
/* Runtime config options */
#define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */
@@ -521,8 +488,12 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
const struct hfsplus_unistr *s2);
int hfsplus_strcmp(const struct hfsplus_unistr *s1,
const struct hfsplus_unistr *s2);
-int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr,
- char *astr, int *len_p);
+int hfsplus_uni2asc_str(struct super_block *sb,
+ const struct hfsplus_unistr *ustr, char *astr,
+ int *len_p);
+int hfsplus_uni2asc_xattr_str(struct super_block *sb,
+ const struct hfsplus_attr_unistr *ustr,
+ char *astr, int *len_p);
int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
int max_unistr_len, const char *astr, int len);
int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
@@ -577,6 +548,48 @@ hfsplus_btree_lock_class(struct hfs_btree *tree)
return class;
}
+static inline
+bool is_bnode_offset_valid(struct hfs_bnode *node, int off)
+{
+ bool is_valid = off < node->tree->node_size;
+
+ if (!is_valid) {
+ pr_err("requested invalid offset: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off);
+ }
+
+ return is_valid;
+}
+
+static inline
+int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
+{
+ unsigned int node_size;
+
+ if (!is_bnode_offset_valid(node, off))
+ return 0;
+
+ node_size = node->tree->node_size;
+
+ if ((off + len) > node_size) {
+ int new_len = (int)node_size - off;
+
+ pr_err("requested length has been corrected: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, "
+ "requested_len %d, corrected_len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len, new_len);
+
+ return new_len;
+ }
+
+ return len;
+}
+
/* compatibility */
#define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) }
#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
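
is_bnode_offset_valid() and check_and_correct_requested_length() move from bnode.c into hfsplus_fs.h so that btree.c (see the hfs_bmap_alloc() hunk earlier) can share them: every read derived from an on-disk offset is first validated against node_size and clamped if it would run past the node. A minimal sketch of that validate-then-clamp discipline, standalone rather than the kernel helpers:

    #include <stdio.h>

    /* Returns the usable length for a read at `off`, or 0 if `off` itself
     * already lies outside the node. */
    static int clamp_read(unsigned node_size, int off, int len)
    {
            if (off < 0 || (unsigned)off >= node_size) {
                    fprintf(stderr, "invalid offset %d (node_size %u)\n",
                            off, node_size);
                    return 0;
            }
            if ((unsigned)(off + len) > node_size)
                    len = (int)node_size - off;
            return len;
    }

    int main(void)
    {
            return clamp_read(512, 500, 64) == 12 ? 0 : 1;
    }
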
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 86351bdc8985..16bc4abc67e0 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -68,13 +68,26 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
if (!(inode->i_state & I_NEW))
return inode;
- INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
- spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock);
- mutex_init(&HFSPLUS_I(inode)->extents_lock);
- HFSPLUS_I(inode)->flags = 0;
+ atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
+ HFSPLUS_I(inode)->first_blocks = 0;
+ HFSPLUS_I(inode)->clump_blocks = 0;
+ HFSPLUS_I(inode)->alloc_blocks = 0;
+ HFSPLUS_I(inode)->cached_start = U32_MAX;
+ HFSPLUS_I(inode)->cached_blocks = 0;
+ memset(HFSPLUS_I(inode)->first_extents, 0, sizeof(hfsplus_extent_rec));
+ memset(HFSPLUS_I(inode)->cached_extents, 0, sizeof(hfsplus_extent_rec));
HFSPLUS_I(inode)->extent_state = 0;
+ mutex_init(&HFSPLUS_I(inode)->extents_lock);
HFSPLUS_I(inode)->rsrc_inode = NULL;
- atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
+ HFSPLUS_I(inode)->create_date = 0;
+ HFSPLUS_I(inode)->linkid = 0;
+ HFSPLUS_I(inode)->flags = 0;
+ HFSPLUS_I(inode)->fs_blocks = 0;
+ HFSPLUS_I(inode)->userflags = 0;
+ HFSPLUS_I(inode)->subfolders = 0;
+ INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
+ spin_lock_init(&HFSPLUS_I(inode)->open_dir_lock);
+ HFSPLUS_I(inode)->phys_size = 0;
if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
inode->i_ino == HFSPLUS_ROOT_CNID) {
@@ -150,7 +163,7 @@ static int hfsplus_write_inode(struct inode *inode,
{
int err;
- hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+ hfs_dbg("ino %lu\n", inode->i_ino);
err = hfsplus_ext_write_extent(inode);
if (err)
@@ -165,7 +178,7 @@ static int hfsplus_write_inode(struct inode *inode,
static void hfsplus_evict_inode(struct inode *inode)
{
- hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
+ hfs_dbg("ino %lu\n", inode->i_ino);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (HFSPLUS_IS_RSRC(inode)) {
@@ -184,7 +197,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
if (!wait)
return 0;
- hfs_dbg(SUPER, "hfsplus_sync_fs\n");
+ hfs_dbg("starting...\n");
/*
* Explicitly write out the special metadata inodes.
@@ -215,6 +228,10 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
vhdr->folder_count = cpu_to_be32(sbi->folder_count);
vhdr->file_count = cpu_to_be32(sbi->file_count);
+ hfs_dbg("free_blocks %u, next_cnid %u, folder_count %u, file_count %u\n",
+ sbi->free_blocks, sbi->next_cnid,
+ sbi->folder_count, sbi->file_count);
+
if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr));
write_backup = 1;
@@ -240,6 +257,8 @@ out:
if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
blkdev_issue_flush(sb->s_bdev);
+ hfs_dbg("finished: err %d\n", error);
+
return error;
}
@@ -288,7 +307,7 @@ static void hfsplus_put_super(struct super_block *sb)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
- hfs_dbg(SUPER, "hfsplus_put_super\n");
+ hfs_dbg("starting...\n");
cancel_delayed_work_sync(&sbi->sync_work);
@@ -310,6 +329,8 @@ static void hfsplus_put_super(struct super_block *sb)
kfree(sbi->s_vhdr_buf);
kfree(sbi->s_backup_vhdr_buf);
call_rcu(&sbi->rcu, delayed_free);
+
+ hfs_dbg("finished\n");
}
static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -524,7 +545,7 @@ static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc)
if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
hfs_find_exit(&fd);
if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) {
- err = -EINVAL;
+ err = -EIO;
goto out_put_root;
}
inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 36b6cf2a3abb..11e08a4a18b2 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -40,6 +40,18 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
p1 = s1->unicode;
p2 = s2->unicode;
+ if (len1 > HFSPLUS_MAX_STRLEN) {
+ len1 = HFSPLUS_MAX_STRLEN;
+ pr_err("invalid length %u has been corrected to %d\n",
+ be16_to_cpu(s1->length), len1);
+ }
+
+ if (len2 > HFSPLUS_MAX_STRLEN) {
+ len2 = HFSPLUS_MAX_STRLEN;
+ pr_err("invalid length %u has been corrected to %d\n",
+ be16_to_cpu(s2->length), len2);
+ }
+
while (1) {
c1 = c2 = 0;
@@ -74,6 +86,18 @@ int hfsplus_strcmp(const struct hfsplus_unistr *s1,
p1 = s1->unicode;
p2 = s2->unicode;
+ if (len1 > HFSPLUS_MAX_STRLEN) {
+ len1 = HFSPLUS_MAX_STRLEN;
+ pr_err("invalid length %u has been corrected to %d\n",
+ be16_to_cpu(s1->length), len1);
+ }
+
+ if (len2 > HFSPLUS_MAX_STRLEN) {
+ len2 = HFSPLUS_MAX_STRLEN;
+ pr_err("invalid length %u has been corrected to %d\n",
+ be16_to_cpu(s2->length), len2);
+ }
+
for (len = min(len1, len2); len > 0; len--) {
c1 = be16_to_cpu(*p1);
c2 = be16_to_cpu(*p2);
@@ -119,9 +143,8 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
return NULL;
}
-int hfsplus_uni2asc(struct super_block *sb,
- const struct hfsplus_unistr *ustr,
- char *astr, int *len_p)
+static int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr,
+ int max_len, char *astr, int *len_p)
{
const hfsplus_unichr *ip;
struct nls_table *nls = HFSPLUS_SB(sb)->nls;
@@ -134,8 +157,8 @@ int hfsplus_uni2asc(struct super_block *sb,
ip = ustr->unicode;
ustrlen = be16_to_cpu(ustr->length);
- if (ustrlen > HFSPLUS_MAX_STRLEN) {
- ustrlen = HFSPLUS_MAX_STRLEN;
+ if (ustrlen > max_len) {
+ ustrlen = max_len;
pr_err("invalid length %u has been corrected to %d\n",
be16_to_cpu(ustr->length), ustrlen);
}
@@ -256,6 +279,21 @@ out:
return res;
}
+inline int hfsplus_uni2asc_str(struct super_block *sb,
+ const struct hfsplus_unistr *ustr, char *astr,
+ int *len_p)
+{
+ return hfsplus_uni2asc(sb, ustr, HFSPLUS_MAX_STRLEN, astr, len_p);
+}
+
+inline int hfsplus_uni2asc_xattr_str(struct super_block *sb,
+ const struct hfsplus_attr_unistr *ustr,
+ char *astr, int *len_p)
+{
+ return hfsplus_uni2asc(sb, (const struct hfsplus_unistr *)ustr,
+ HFSPLUS_ATTR_MAX_STRLEN, astr, len_p);
+}
+
/*
* Convert one or more ASCII characters into a single unicode character.
* Returns the number of ASCII characters corresponding to the unicode char.
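
hfsplus_uni2asc() becomes a static worker taking an explicit max_len, and the two inline wrappers above pin that limit per call site -- HFSPLUS_MAX_STRLEN for catalog names, HFSPLUS_ATTR_MAX_STRLEN for xattr key names -- so an attribute string can no longer be clamped against the larger catalog limit by accident. The wrapper shape, reduced to a toy sketch (the converter body and the numeric limits here are illustrative):

    /* One bounded worker, two call-site-specific wrappers. */
    static int uni2asc(const unsigned short *uni, int uni_len,
                       int max_len, char *out, int *out_len)
    {
            int i, n = uni_len > max_len ? max_len : uni_len;

            for (i = 0; i < n && i < *out_len; i++)
                    out[i] = (uni[i] < 0x80) ? (char)uni[i] : '?';
            *out_len = i;
            return 0;
    }

    static inline int uni2asc_name(const unsigned short *uni, int uni_len,
                                   char *out, int *out_len)
    {
            return uni2asc(uni, uni_len, 255 /* name limit */, out, out_len);
    }

    static inline int uni2asc_xattr(const unsigned short *uni, int uni_len,
                                    char *out, int *out_len)
    {
            return uni2asc(uni, uni_len, 127 /* xattr key limit */, out, out_len);
    }

    int main(void)
    {
            unsigned short name[] = { 'a', 'b', 0x2603 };
            char out[8];
            int out_len = (int)sizeof(out);

            uni2asc_name(name, 3, out, &out_len);
            return out_len == 3 && out[2] == '?' ? 0 : 1;
    }
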
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 18dc3d254d21..ece4d29c0ab9 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -64,7 +64,7 @@ static void hfsplus_init_header_node(struct inode *attr_file,
u32 used_bmp_bytes;
u64 tmp;
- hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n",
+ hfs_dbg("clump %u, node_size %u\n",
clump_size, node_size);
/* The end of the node contains list of record offsets */
@@ -132,7 +132,7 @@ static int hfsplus_create_attributes_file(struct super_block *sb)
struct page *page;
int old_state = HFSPLUS_EMPTY_ATTR_TREE;
- hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID);
+ hfs_dbg("ino %d\n", HFSPLUS_ATTR_CNID);
check_attr_tree_state_again:
switch (atomic_read(&sbi->attr_tree_state)) {
@@ -735,9 +735,9 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
goto end_listxattr;
xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN;
- if (hfsplus_uni2asc(inode->i_sb,
- (const struct hfsplus_unistr *)&fd.key->attr.key_name,
- strbuf, &xattr_name_len)) {
+ if (hfsplus_uni2asc_xattr_str(inode->i_sb,
+ &fd.key->attr.key_name, strbuf,
+ &xattr_name_len)) {
pr_err("unicode conversion failed\n");
res = -EIO;
goto end_listxattr;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 01e516175bcd..1e1acf5775ab 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
static const struct super_operations hostfs_sbops = {
.alloc_inode = hostfs_alloc_inode,
.free_inode = hostfs_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = hostfs_evict_inode,
.statfs = hostfs_statfs,
.show_options = hostfs_show_options,
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index a59e8fa630db..34008442ee26 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i)
struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
struct inode *parent;
if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return;
- if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) {
+ if (hpfs_inode->i_rddir_off && !icount_read(i)) {
if (*hpfs_inode->i_rddir_off)
pr_err("write_inode: some position still there\n");
kfree(hpfs_inode->i_rddir_off);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 09d4baef29cf..be4be99304bc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -517,14 +517,16 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
/*
* If folio is mapped, it was faulted in after being
- * unmapped in caller. Unmap (again) while holding
- * the fault mutex. The mutex will prevent faults
- * until we finish removing the folio.
+ * unmapped in caller or hugetlb_vmdelete_list() skips
+ * unmapping it when it fails to grab the lock. Unmap (again)
+ * while holding the fault mutex. The mutex will prevent
+ * faults until we finish removing the folio. Hold folio
+ * lock to guarantee no concurrent migration.
*/
+ folio_lock(folio);
if (unlikely(folio_mapped(folio)))
hugetlb_unmap_file_folio(h, mapping, folio, index);
- folio_lock(folio);
/*
* We must remove the folio from page cache before removing
* the region/ reserve map (hugetlb_unreserve_pages). In
diff --git a/fs/init.c b/fs/init.c
index eef5124885e3..07f592ccdba8 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -149,7 +149,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
else if (!(S_ISBLK(mode) || S_ISCHR(mode)))
return -EINVAL;
- dentry = kern_path_create(AT_FDCWD, filename, &path, 0);
+ dentry = start_creating_path(AT_FDCWD, filename, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -158,7 +158,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
if (!error)
error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, mode, new_decode_dev(dev));
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return error;
}
@@ -173,7 +173,7 @@ int __init init_link(const char *oldname, const char *newname)
if (error)
return error;
- new_dentry = kern_path_create(AT_FDCWD, newname, &new_path, 0);
+ new_dentry = start_creating_path(AT_FDCWD, newname, &new_path, 0);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto out;
@@ -191,7 +191,7 @@ int __init init_link(const char *oldname, const char *newname)
error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
new_dentry, NULL);
out_dput:
- done_path_create(&new_path, new_dentry);
+ end_creating_path(&new_path, new_dentry);
out:
path_put(&old_path);
return error;
@@ -203,14 +203,14 @@ int __init init_symlink(const char *oldname, const char *newname)
struct path path;
int error;
- dentry = kern_path_create(AT_FDCWD, newname, &path, 0);
+ dentry = start_creating_path(AT_FDCWD, newname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
error = security_path_symlink(&path, dentry, oldname);
if (!error)
error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, oldname);
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return error;
}
@@ -225,7 +225,8 @@ int __init init_mkdir(const char *pathname, umode_t mode)
struct path path;
int error;
- dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
+ dentry = start_creating_path(AT_FDCWD, pathname, &path,
+ LOOKUP_DIRECTORY);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
mode = mode_strip_umask(d_inode(path.dentry), mode);
@@ -236,7 +237,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
if (IS_ERR(dentry))
error = PTR_ERR(dentry);
}
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return error;
}
diff --git a/fs/inode.c b/fs/inode.c
index 01ebdc40021e..ec9339024ac3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -534,7 +534,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
{
if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
return;
- if (atomic_read(&inode->i_count))
+ if (icount_read(inode))
return;
if (!(inode->i_sb->s_flags & SB_ACTIVE))
return;
@@ -550,11 +550,11 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
struct inode *inode, u32 bit)
{
- void *bit_address;
+ void *bit_address;
- bit_address = inode_state_wait_address(inode, bit);
- init_wait_var_entry(wqe, bit_address, 0);
- return __var_waitqueue(bit_address);
+ bit_address = inode_state_wait_address(inode, bit);
+ init_wait_var_entry(wqe, bit_address, 0);
+ return __var_waitqueue(bit_address);
}
EXPORT_SYMBOL(inode_bit_waitqueue);
@@ -871,11 +871,11 @@ void evict_inodes(struct super_block *sb)
again:
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- if (atomic_read(&inode->i_count))
+ if (icount_read(inode))
continue;
spin_lock(&inode->i_lock);
- if (atomic_read(&inode->i_count)) {
+ if (icount_read(inode)) {
spin_unlock(&inode->i_lock);
continue;
}
@@ -937,7 +937,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
* unreclaimable for a while. Remove them lazily here; iput,
* sync, or the last page cache deletion will requeue them.
*/
- if (atomic_read(&inode->i_count) ||
+ if (icount_read(inode) ||
(inode->i_state & ~I_REFERENCED) ||
!mapping_shrinkable(&inode->i_data)) {
list_lru_isolate(lru, &inode->i_lru);
@@ -1279,6 +1279,8 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
struct inode *old;
+ might_sleep();
+
again:
spin_lock(&inode_hash_lock);
old = find_inode(inode->i_sb, head, test, data, true);
@@ -1382,6 +1384,8 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode, *new;
+ might_sleep();
+
again:
inode = find_inode(sb, head, test, data, false);
if (inode) {
@@ -1422,6 +1426,9 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+
+ might_sleep();
+
again:
inode = find_inode_fast(sb, head, ino, false);
if (inode) {
@@ -1605,6 +1612,9 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
struct inode *inode;
+
+ might_sleep();
+
again:
inode = ilookup5_nowait(sb, hashval, test, data);
if (inode) {
@@ -1630,6 +1640,9 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+
+ might_sleep();
+
again:
inode = find_inode_fast(sb, head, ino, false);
@@ -1780,6 +1793,8 @@ int insert_inode_locked(struct inode *inode)
ino_t ino = inode->i_ino;
struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ might_sleep();
+
while (1) {
struct inode *old = NULL;
spin_lock(&inode_hash_lock);
@@ -1826,6 +1841,8 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
{
struct inode *old;
+ might_sleep();
+
inode->i_state |= I_CREATING;
old = inode_insert5(inode, hashval, test, NULL, data);
@@ -1838,11 +1855,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
EXPORT_SYMBOL(insert_inode_locked4);
-int generic_delete_inode(struct inode *inode)
+int inode_just_drop(struct inode *inode)
{
return 1;
}
-EXPORT_SYMBOL(generic_delete_inode);
+EXPORT_SYMBOL(inode_just_drop);
/*
* Called when we're dropping the last reference
@@ -1866,7 +1883,7 @@ static void iput_final(struct inode *inode)
if (op->drop_inode)
drop = op->drop_inode(inode);
else
- drop = generic_drop_inode(inode);
+ drop = inode_generic_drop(inode);
if (!drop &&
!(inode->i_state & I_DONTCACHE) &&
@@ -1908,20 +1925,45 @@ static void iput_final(struct inode *inode)
*/
void iput(struct inode *inode)
{
- if (!inode)
+ might_sleep();
+ if (unlikely(!inode))
return;
- BUG_ON(inode->i_state & I_CLEAR);
+
retry:
- if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
- if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
- atomic_inc(&inode->i_count);
- spin_unlock(&inode->i_lock);
- trace_writeback_lazytime_iput(inode);
- mark_inode_dirty_sync(inode);
- goto retry;
- }
- iput_final(inode);
+ lockdep_assert_not_held(&inode->i_lock);
+ VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode);
+ /*
+	 * Note this assert is technically racy: if the count is bogusly
+	 * equal to one, two CPUs racing to further drop it can both
+ * conclude it's fine.
+ */
+ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode);
+
+ if (atomic_add_unless(&inode->i_count, -1, 1))
+ return;
+
+ if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) {
+ trace_writeback_lazytime_iput(inode);
+ mark_inode_dirty_sync(inode);
+ goto retry;
+ }
+
+ spin_lock(&inode->i_lock);
+ if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) {
+ spin_unlock(&inode->i_lock);
+ goto retry;
}
+
+ if (!atomic_dec_and_test(&inode->i_count)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ /*
+ * iput_final() drops ->i_lock, we can't assert on it as the inode may
+ * be deallocated by the time the call returns.
+ */
+ iput_final(inode);
}
EXPORT_SYMBOL(iput);
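The rework above gives iput() a lock-free fast path: references above one are dropped with atomic_add_unless(), and i_lock is only taken once the caller may hold the last reference. A minimal userspace sketch of the same pattern (not part of the patch); struct obj, obj_put() and obj_free_final() are hypothetical, with C11 atomics and a pthread mutex standing in for atomic_t i_count and i_lock:

/*
 * Sketch of the fast/slow put pattern used by the reworked iput() above.
 * obj_free_final() plays the role of iput_final() and releases the lock
 * itself, mirroring the comment in the patch.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int count;
	pthread_mutex_t lock;
};

static void obj_free_final(struct obj *o)
{
	/* final teardown; like iput_final(), it drops the lock itself */
	pthread_mutex_unlock(&o->lock);
	free(o);
}

static void obj_put(struct obj *o)
{
	int c = atomic_load(&o->count);

	/* fast path: drop a non-final reference without taking the lock */
	while (c > 1) {
		if (atomic_compare_exchange_weak(&o->count, &c, c - 1))
			return;
	}

	/* slow path: lock, then re-check whether we really are the last user */
	pthread_mutex_lock(&o->lock);
	if (atomic_fetch_sub(&o->count, 1) != 1) {
		pthread_mutex_unlock(&o->lock);
		return;
	}
	obj_free_final(o);
}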
@@ -2189,7 +2231,7 @@ static int __remove_privs(struct mnt_idmap *idmap,
return notify_change(idmap, dentry, &newattrs, NULL);
}
-int file_remove_privs_flags(struct file *file, unsigned int flags)
+static int file_remove_privs_flags(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
@@ -2214,7 +2256,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags)
inode_has_no_xattr(inode);
return error;
}
-EXPORT_SYMBOL_GPL(file_remove_privs_flags);
/**
* file_remove_privs - remove special file privileges (suid, capabilities)
@@ -2519,21 +2560,28 @@ void __init inode_init(void)
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_mode = mode;
- if (S_ISCHR(mode)) {
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFCHR:
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
- } else if (S_ISBLK(mode)) {
+ break;
+ case S_IFBLK:
if (IS_ENABLED(CONFIG_BLOCK))
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
- } else if (S_ISFIFO(mode))
+ break;
+ case S_IFIFO:
inode->i_fop = &pipefifo_fops;
- else if (S_ISSOCK(mode))
- ; /* leave it no_open_fops */
- else
+ break;
+ case S_IFSOCK:
+ /* leave it no_open_fops */
+ break;
+ default:
printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
" inode %s:%lu\n", mode, inode->i_sb->s_id,
inode->i_ino);
+ break;
+ }
}
EXPORT_SYMBOL(init_special_inode);
@@ -2911,10 +2959,18 @@ EXPORT_SYMBOL(mode_strip_sgid);
*
* TODO: add a proper inode dumping routine, this is a stub to get debug off the
* ground.
+ *
+ * TODO: handle getting to fs type with get_kernel_nofault()?
+ * See dump_mapping() above.
*/
void dump_inode(struct inode *inode, const char *reason)
{
- pr_warn("%s encountered for inode %px", reason, inode);
+ struct super_block *sb = inode->i_sb;
+
+ pr_warn("%s encountered for inode %px\n"
+ "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n",
+ reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags,
+ inode->i_flags, inode->i_state, atomic_read(&inode->i_count));
}
EXPORT_SYMBOL(dump_inode);
diff --git a/fs/internal.h b/fs/internal.h
index 38e8aab27bbd..a33d18ee5b74 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
void pidfs_get_root(struct path *path);
+void nsfs_get_root(struct path *path);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0248cb8db2d3..1c152c2b1b67 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -41,7 +41,7 @@
*
* Returns 0 on success, -errno on error.
*/
-int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
out:
return error;
}
-EXPORT_SYMBOL(vfs_ioctl);
static int ioctl_fibmap(struct file *filp, int __user *p)
{
@@ -426,7 +425,7 @@ static int ioctl_file_dedupe_range(struct file *file,
goto out;
}
- size = offsetof(struct file_dedupe_range, info[count]);
+ size = struct_size(same, info, count);
if (size > PAGE_SIZE) {
ret = -ENOMEM;
goto out;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index fd827398afd2..8b847a1e27f1 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -304,6 +304,9 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
size_t size = i_size_read(iter->inode) - iomap->offset;
size_t offset = offset_in_folio(folio, iomap->offset);
+ if (WARN_ON_ONCE(!iomap->inline_data))
+ return -EIO;
+
if (folio_test_uptodate(folio))
return 0;
@@ -894,7 +897,7 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
return true;
}
-static void iomap_write_end_inline(const struct iomap_iter *iter,
+static bool iomap_write_end_inline(const struct iomap_iter *iter,
struct folio *folio, loff_t pos, size_t copied)
{
const struct iomap *iomap = &iter->iomap;
@@ -903,12 +906,16 @@ static void iomap_write_end_inline(const struct iomap_iter *iter,
WARN_ON_ONCE(!folio_test_uptodate(folio));
BUG_ON(!iomap_inline_data_valid(iomap));
+ if (WARN_ON_ONCE(!iomap->inline_data))
+ return false;
+
flush_dcache_folio(folio);
addr = kmap_local_folio(folio, pos);
memcpy(iomap_inline_data(iomap, pos), addr, copied);
kunmap_local(addr);
mark_inode_dirty(iter->inode);
+ return true;
}
/*
@@ -921,10 +928,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
- if (srcmap->type == IOMAP_INLINE) {
- iomap_write_end_inline(iter, folio, pos, copied);
- return true;
- }
+ if (srcmap->type == IOMAP_INLINE)
+ return iomap_write_end_inline(iter, folio, pos, copied);
if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
size_t bh_written;
@@ -1396,6 +1401,9 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
/* warn about zeroing folios beyond eof that won't write back */
WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
+ trace_iomap_zero_iter(iter->inode, folio_pos(folio) + offset,
+ bytes);
+
folio_zero_range(folio, offset, bytes);
folio_mark_accessed(folio);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b84f6af2eb4c..46aa85af13dc 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -519,6 +519,9 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
loff_t pos = iomi->pos;
u64 copied;
+ if (WARN_ON_ONCE(!inline_data))
+ return -EIO;
+
if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
return -EIO;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 6ad66e6ba653..a61c1dae4742 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -84,6 +84,7 @@ DEFINE_RANGE_EVENT(iomap_release_folio);
DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
+DEFINE_RANGE_EVENT(iomap_zero_iter);
#define IOMAP_TYPE_STRINGS \
{ IOMAP_HOLE, "HOLE" }, \
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index a6c692cac616..9adf36e6364b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
!list_empty(&of->list));
}
+/* Get active reference to kernfs node for an open file */
+static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of)
+{
+ /* Skip if file was already released */
+ if (unlikely(of->released))
+ return NULL;
+
+ if (!kernfs_get_active(of->kn))
+ return NULL;
+
+ return of;
+}
+
+static void kernfs_put_active_of(struct kernfs_open_file *of)
+{
+ return kernfs_put_active(of->kn);
+}
+
/**
* kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn
*
@@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
if (ops->seq_stop)
ops->seq_stop(sf, v);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
}
static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
@@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return ERR_PTR(-ENODEV);
ops = kernfs_ops(of->kn);
@@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
len = -ENODEV;
mutex_unlock(&of->mutex);
goto out_free;
@@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
else
len = -EINVAL;
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
if (len < 0)
@@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
mutex_unlock(&of->mutex);
len = -ENODEV;
goto out_free;
@@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
else
len = -EINVAL;
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
if (len > 0)
@@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma)
if (!of->vm_ops)
return;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return;
if (of->vm_ops->open)
of->vm_ops->open(vma);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
}
static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
@@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return VM_FAULT_SIGBUS;
ret = VM_FAULT_SIGBUS;
if (of->vm_ops->fault)
ret = of->vm_ops->fault(vmf);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return VM_FAULT_SIGBUS;
ret = 0;
@@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
else
file_update_time(file);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
if (!of->vm_ops)
return -EINVAL;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return -EINVAL;
ret = -EINVAL;
if (of->vm_ops->access)
ret = of->vm_ops->access(vma, addr, buf, len, write);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
mutex_lock(&of->mutex);
rc = -ENODEV;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
goto out_unlock;
ops = kernfs_ops(of->kn);
@@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
}
vma->vm_ops = &kernfs_vm_ops;
out_put:
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
out_unlock:
mutex_unlock(&of->mutex);
@@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
__poll_t ret;
- if (!kernfs_get_active(kn))
+ if (!kernfs_get_active_of(of))
return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
if (kn->attr.ops->poll)
@@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
else
ret = kernfs_generic_poll(of, wait);
- kernfs_put_active(kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
mutex_unlock(&of->mutex);
return -ENODEV;
}
@@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
else
ret = generic_file_llseek(file, offset, whence);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
return ret;
}
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index e384a69fbece..76eaf64b9d9e 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf)
const struct super_operations kernfs_sops = {
.statfs = kernfs_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = kernfs_evict_inode,
.show_options = kernfs_sop_show_options,
diff --git a/fs/locks.c b/fs/locks.c
index 559f02aa4172..04a3f0e20724 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2328,8 +2328,8 @@ out:
* To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
* locks, the ->lock() interface may return asynchronously, before the lock has
* been granted or denied by the underlying filesystem, if (and only if)
- * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations
- * flags need to be set.
+ * lm_grant is set. Additionally FOP_ASYNC_LOCK in file_operations fop_flags
+ * needs to be set.
*
* Callers expecting ->lock() to return asynchronously will only use F_SETLK,
* not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index df9d11479caf..32db676127a9 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -492,8 +492,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev)
inode->i_op = &minix_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_mapping->a_ops = &minix_aops;
- } else
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
init_special_inode(inode, inode->i_mode, rdev);
+ } else {
+ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ make_bad_inode(inode);
+ }
}
/*
diff --git a/fs/mount.h b/fs/mount.h
index 97737051a8b9..79c85639a7ba 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -17,11 +17,7 @@ struct mnt_namespace {
};
struct user_namespace *user_ns;
struct ucounts *ucounts;
- u64 seq; /* Sequence number to prevent loops */
- union {
- wait_queue_head_t poll;
- struct rcu_head mnt_ns_rcu;
- };
+ wait_queue_head_t poll;
u64 seq_origin; /* Sequence number of origin mount namespace */
u64 event;
#ifdef CONFIG_FSNOTIFY
@@ -30,8 +26,6 @@ struct mnt_namespace {
#endif
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
- struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
- struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout;
@@ -149,7 +143,7 @@ static inline void detach_mounts(struct dentry *dentry)
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
}
extern seqlock_t mount_lock;
@@ -173,7 +167,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry)
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
- return ns->seq == 0;
+ return ns->ns.ns_id == 0;
}
static inline bool anon_ns_root(const struct mount *m)
diff --git a/fs/namei.c b/fs/namei.c
index cd43ff89fbaa..507ca0d7878d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1449,6 +1449,10 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags
dentry->d_inode)
return -EISDIR;
+ /* No need to trigger automounts if mountpoint crossing is disabled. */
+ if (lookup_flags & LOOKUP_NO_XDEV)
+ return -EXDEV;
+
if (count && (*count)++ >= MAXSYMLINKS)
return -ELOOP;
@@ -1472,6 +1476,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
/* Allow the filesystem to manage the transit without i_rwsem
* being held. */
if (flags & DCACHE_MANAGE_TRANSIT) {
+ if (lookup_flags & LOOKUP_NO_XDEV) {
+ ret = -EXDEV;
+ break;
+ }
ret = path->dentry->d_op->d_manage(path, false);
flags = smp_load_acquire(&path->dentry->d_flags);
if (ret < 0)
@@ -1489,6 +1497,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
// here we know it's positive
flags = path->dentry->d_flags;
need_mntput = true;
+ if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) {
+ ret = -EXDEV;
+ break;
+ }
continue;
}
}
@@ -1630,12 +1642,8 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
return -ECHILD;
}
ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
- if (jumped) {
- if (unlikely(nd->flags & LOOKUP_NO_XDEV))
- ret = -EXDEV;
- else
- nd->state |= ND_JUMPED;
- }
+ if (jumped)
+ nd->state |= ND_JUMPED;
if (unlikely(ret)) {
dput(path->dentry);
if (path->mnt != nd->path.mnt)
@@ -1827,6 +1835,20 @@ static struct dentry *lookup_slow(const struct qstr *name,
return res;
}
+static struct dentry *lookup_slow_killable(const struct qstr *name,
+ struct dentry *dir,
+ unsigned int flags)
+{
+ struct inode *inode = dir->d_inode;
+ struct dentry *res;
+
+ if (inode_lock_shared_killable(inode))
+ return ERR_PTR(-EINTR);
+ res = __lookup_slow(name, dir, flags);
+ inode_unlock_shared(inode);
+ return res;
+}
+
static inline int may_lookup(struct mnt_idmap *idmap,
struct nameidata *restrict nd)
{
@@ -2744,7 +2766,8 @@ static int filename_parentat(int dfd, struct filename *name,
}
/* does lookup, returns the object with parent locked */
-static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
+static struct dentry *__start_removing_path(int dfd, struct filename *name,
+ struct path *path)
{
struct path parent_path __free(path_put) = {};
struct dentry *d;
@@ -2756,18 +2779,42 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct
return ERR_PTR(error);
if (unlikely(type != LAST_NORM))
return ERR_PTR(-EINVAL);
+ /* don't fail immediately if it's r/o, at least try to report other errors */
+ error = mnt_want_write(parent_path.mnt);
inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
- if (IS_ERR(d)) {
- inode_unlock(parent_path.dentry->d_inode);
- return d;
- }
+ if (IS_ERR(d))
+ goto unlock;
+ if (error)
+ goto fail;
path->dentry = no_free_ptr(parent_path.dentry);
path->mnt = no_free_ptr(parent_path.mnt);
return d;
+
+fail:
+ dput(d);
+ d = ERR_PTR(error);
+unlock:
+ inode_unlock(parent_path.dentry->d_inode);
+ if (!error)
+ mnt_drop_write(parent_path.mnt);
+ return d;
}
-struct dentry *kern_path_locked_negative(const char *name, struct path *path)
+/**
+ * kern_path_parent - lookup path returning parent and target
+ * @name: path name
+ * @path: path to store parent in
+ *
+ * The path @name should end with a normal component, not "." or ".." or "/".
+ * A lookup is performed and, if successful, the parent information
+ * is stored in @path and the dentry is returned.
+ *
+ * The dentry may be negative; the parent will be positive.
+ *
+ * Returns: dentry or error.
+ */
+struct dentry *kern_path_parent(const char *name, struct path *path)
{
struct path parent_path __free(path_put) = {};
struct filename *filename __free(putname) = getname_kernel(name);
@@ -2780,35 +2827,35 @@ struct dentry *kern_path_locked_negative(const char *name, struct path *path)
return ERR_PTR(error);
if (unlikely(type != LAST_NORM))
return ERR_PTR(-EINVAL);
- inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
- d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE);
- if (IS_ERR(d)) {
- inode_unlock(parent_path.dentry->d_inode);
+
+ d = lookup_noperm_unlocked(&last, parent_path.dentry);
+ if (IS_ERR(d))
return d;
- }
path->dentry = no_free_ptr(parent_path.dentry);
path->mnt = no_free_ptr(parent_path.mnt);
return d;
}
-struct dentry *kern_path_locked(const char *name, struct path *path)
+struct dentry *start_removing_path(const char *name, struct path *path)
{
struct filename *filename = getname_kernel(name);
- struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path);
+ struct dentry *res = __start_removing_path(AT_FDCWD, filename, path);
putname(filename);
return res;
}
-struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
+struct dentry *start_removing_user_path_at(int dfd,
+ const char __user *name,
+ struct path *path)
{
struct filename *filename = getname(name);
- struct dentry *res = __kern_path_locked(dfd, filename, path);
+ struct dentry *res = __start_removing_path(dfd, filename, path);
putname(filename);
return res;
}
-EXPORT_SYMBOL(user_path_locked_at);
+EXPORT_SYMBOL(start_removing_user_path_at);
int kern_path(const char *name, unsigned int flags, struct path *path)
{
@@ -3011,6 +3058,47 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
EXPORT_SYMBOL(lookup_one_unlocked);
/**
+ * lookup_one_positive_killable - lookup single pathname component
+ * @idmap: idmap of the mount the lookup is performed from
+ * @name: qstr holding pathname component to lookup
+ * @base: base directory to lookup from
+ *
+ * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
+ * known positive or ERR_PTR(). This is what most of the users want.
+ *
+ * Note that a pinned negative with unlocked parent _can_ become positive at any
+ * time, so callers of lookup_one_unlocked() need to be very careful; pinned
+ * positives have ->d_inode stable, so this one avoids such problems.
+ *
+ * This can be used for in-kernel filesystem clients such as file servers.
+ *
+ * It should be called without the parent i_rwsem held, and will take
+ * the i_rwsem itself if necessary. If a fatal signal is pending or
+ * delivered, it will return %-EINTR if the lock is needed.
+ */
+struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
+ struct qstr *name,
+ struct dentry *base)
+{
+ int err;
+ struct dentry *ret;
+
+ err = lookup_one_common(idmap, name, base);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = lookup_dcache(name, base, 0);
+ if (!ret)
+ ret = lookup_slow_killable(name, base, 0);
+ if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
+ dput(ret);
+ ret = ERR_PTR(-ENOENT);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_positive_killable);
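A hedged usage sketch for the new helper follows (not part of the patch): only lookup_one_positive_killable() and QSTR_INIT() come from the kernel, everything else is hypothetical.

/*
 * Hypothetical caller: resolve one component under a known-positive parent,
 * giving up cleanly if a fatal signal arrives while waiting for i_rwsem.
 */
static struct dentry *example_lookup_child(struct mnt_idmap *idmap,
					   struct dentry *parent,
					   const char *component, int len)
{
	struct qstr name = QSTR_INIT(component, len);
	struct dentry *child;

	/* May sleep on the parent's i_rwsem; returns -EINTR on a fatal signal. */
	child = lookup_one_positive_killable(idmap, &name, parent);
	if (IS_ERR(child))
		return child;		/* -ENOENT for a negative entry */

	/* The dentry is pinned positive, so child->d_inode is stable here. */
	return child;
}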
+
+/**
* lookup_one_positive_unlocked - lookup single pathname component
* @idmap: idmap of the mount the lookup is performed from
* @name: qstr holding pathname component to lookup
@@ -4114,7 +4202,6 @@ static struct dentry *filename_create(int dfd, struct filename *name,
unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
int type;
- int err2;
int error;
error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
@@ -4129,7 +4216,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
goto out;
/* don't fail immediately if it's r/o, at least try to report other errors */
- err2 = mnt_want_write(path->mnt);
+ error = mnt_want_write(path->mnt);
/*
* Do the final lookup. Suppress 'create' if there is a trailing
* '/', and a directory wasn't requested.
@@ -4142,25 +4229,24 @@ static struct dentry *filename_create(int dfd, struct filename *name,
if (IS_ERR(dentry))
goto unlock;
- if (unlikely(err2)) {
- error = err2;
+ if (unlikely(error))
goto fail;
- }
+
return dentry;
fail:
dput(dentry);
dentry = ERR_PTR(error);
unlock:
inode_unlock(path->dentry->d_inode);
- if (!err2)
+ if (!error)
mnt_drop_write(path->mnt);
out:
path_put(path);
return dentry;
}
-struct dentry *kern_path_create(int dfd, const char *pathname,
- struct path *path, unsigned int lookup_flags)
+struct dentry *start_creating_path(int dfd, const char *pathname,
+ struct path *path, unsigned int lookup_flags)
{
struct filename *filename = getname_kernel(pathname);
struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
@@ -4168,9 +4254,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname,
putname(filename);
return res;
}
-EXPORT_SYMBOL(kern_path_create);
+EXPORT_SYMBOL(start_creating_path);
-void done_path_create(struct path *path, struct dentry *dentry)
+void end_creating_path(struct path *path, struct dentry *dentry)
{
if (!IS_ERR(dentry))
dput(dentry);
@@ -4178,10 +4264,11 @@ void done_path_create(struct path *path, struct dentry *dentry)
mnt_drop_write(path->mnt);
path_put(path);
}
-EXPORT_SYMBOL(done_path_create);
+EXPORT_SYMBOL(end_creating_path);
-inline struct dentry *user_path_create(int dfd, const char __user *pathname,
- struct path *path, unsigned int lookup_flags)
+inline struct dentry *start_creating_user_path(
+ int dfd, const char __user *pathname,
+ struct path *path, unsigned int lookup_flags)
{
struct filename *filename = getname(pathname);
struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
@@ -4189,7 +4276,7 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname,
putname(filename);
return res;
}
-EXPORT_SYMBOL(user_path_create);
+EXPORT_SYMBOL(start_creating_user_path);
/**
* vfs_mknod - create device node or file
@@ -4297,7 +4384,7 @@ retry:
break;
}
out2:
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
@@ -4401,7 +4488,7 @@ retry:
if (IS_ERR(dentry))
error = PTR_ERR(dentry);
}
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
@@ -4755,7 +4842,7 @@ retry:
if (!error)
error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, from->name);
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
@@ -4828,7 +4915,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
return -EPERM;
/*
* Updating the link count will likely cause i_uid and i_gid to
- * be writen back improperly if their true value is unknown to
+ * be written back improperly if their true value is unknown to
* the vfs.
*/
if (HAS_UNMAPPED_ID(idmap, inode))
@@ -4924,7 +5011,7 @@ retry:
error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
new_dentry, &delegated_inode);
out_dput:
- done_path_create(&new_path, new_dentry);
+ end_creating_path(&new_path, new_dentry);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error) {
@@ -5024,20 +5111,20 @@ int vfs_rename(struct renamedata *rd)
if (source == target)
return 0;
- error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir);
+ error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir);
if (error)
return error;
if (!target) {
- error = may_create(rd->new_mnt_idmap, new_dir, new_dentry);
+ error = may_create(rd->mnt_idmap, new_dir, new_dentry);
} else {
new_is_dir = d_is_dir(new_dentry);
if (!(flags & RENAME_EXCHANGE))
- error = may_delete(rd->new_mnt_idmap, new_dir,
+ error = may_delete(rd->mnt_idmap, new_dir,
new_dentry, is_dir);
else
- error = may_delete(rd->new_mnt_idmap, new_dir,
+ error = may_delete(rd->mnt_idmap, new_dir,
new_dentry, new_is_dir);
}
if (error)
@@ -5052,13 +5139,13 @@ int vfs_rename(struct renamedata *rd)
*/
if (new_dir != old_dir) {
if (is_dir) {
- error = inode_permission(rd->old_mnt_idmap, source,
+ error = inode_permission(rd->mnt_idmap, source,
MAY_WRITE);
if (error)
return error;
}
if ((flags & RENAME_EXCHANGE) && new_is_dir) {
- error = inode_permission(rd->new_mnt_idmap, target,
+ error = inode_permission(rd->mnt_idmap, target,
MAY_WRITE);
if (error)
return error;
@@ -5126,7 +5213,7 @@ int vfs_rename(struct renamedata *rd)
if (error)
goto out;
}
- error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
+ error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry,
new_dir, new_dentry, flags);
if (error)
goto out;
@@ -5269,10 +5356,9 @@ retry_deleg:
rd.old_parent = old_path.dentry;
rd.old_dentry = old_dentry;
- rd.old_mnt_idmap = mnt_idmap(old_path.mnt);
+ rd.mnt_idmap = mnt_idmap(old_path.mnt);
rd.new_parent = new_path.dentry;
rd.new_dentry = new_dentry;
- rd.new_mnt_idmap = mnt_idmap(new_path.mnt);
rd.delegated_inode = &delegated_inode;
rd.flags = flags;
error = vfs_rename(&rd);
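The path-creation renames above keep the old kern_path_create()/done_path_create() calling convention. A condensed sketch of a caller using the new names, following the symlink hunks above; the example_symlink() wrapper and the lookup_flags value of 0 are assumptions:

/* Hypothetical in-kernel caller of the renamed helpers; start_creating_path()
 * pairs with end_creating_path() on both the success and the error path. */
static int example_symlink(const char *oldname, const char *newname)
{
	struct path path;
	struct dentry *dentry;
	int error;

	dentry = start_creating_path(AT_FDCWD, newname, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	error = security_path_symlink(&path, dentry, oldname);
	if (!error)
		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
				    dentry, oldname);

	end_creating_path(&path, dentry);
	return error;
}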
diff --git a/fs/namespace.c b/fs/namespace.c
index ae6d1312b184..dc01b14c58cd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -33,6 +33,7 @@
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/pidfs.h>
+#include <linux/nstree.h>
#include "pnode.h"
#include "internal.h"
@@ -65,6 +66,15 @@ static int __init set_mphash_entries(char *str)
}
__setup("mphash_entries=", set_mphash_entries);
+static char * __initdata initramfs_options;
+static int __init initramfs_options_setup(char *str)
+{
+ initramfs_options = str;
+ return 1;
+}
+
+__setup("initramfs_options=", initramfs_options_setup);
+
static u64 event;
static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
static DEFINE_IDA(mnt_group_ida);
@@ -80,13 +90,10 @@ static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
-static DEFINE_SEQLOCK(mnt_ns_tree_lock);
#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
-static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
-static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
enum mount_kattr_flags_t {
MOUNT_KATTR_RECURSE = (1 << 0),
@@ -119,59 +126,18 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
+ struct ns_common *ns;
+
if (!node)
return NULL;
- return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
-}
-
-static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
-{
- struct mnt_namespace *ns_a = node_to_mnt_ns(a);
- struct mnt_namespace *ns_b = node_to_mnt_ns(b);
- u64 seq_a = ns_a->seq;
- u64 seq_b = ns_b->seq;
-
- if (seq_a < seq_b)
- return -1;
- if (seq_a > seq_b)
- return 1;
- return 0;
-}
-
-static inline void mnt_ns_tree_write_lock(void)
-{
- write_seqlock(&mnt_ns_tree_lock);
-}
-
-static inline void mnt_ns_tree_write_unlock(void)
-{
- write_sequnlock(&mnt_ns_tree_lock);
-}
-
-static void mnt_ns_tree_add(struct mnt_namespace *ns)
-{
- struct rb_node *node, *prev;
-
- mnt_ns_tree_write_lock();
- node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
- /*
- * If there's no previous entry simply add it after the
- * head and if there is add it after the previous entry.
- */
- prev = rb_prev(&ns->mnt_ns_tree_node);
- if (!prev)
- list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
- else
- list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
- mnt_ns_tree_write_unlock();
-
- WARN_ON_ONCE(node);
+ ns = rb_entry(node, struct ns_common, ns_tree_node);
+ return container_of(ns, struct mnt_namespace, ns);
}
static void mnt_ns_release(struct mnt_namespace *ns)
{
/* keep alive for {list,stat}mount() */
- if (refcount_dec_and_test(&ns->passive)) {
+ if (ns && refcount_dec_and_test(&ns->passive)) {
fsnotify_mntns_delete(ns);
put_user_ns(ns->user_ns);
kfree(ns);
@@ -181,32 +147,16 @@ DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
- mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+ mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu));
}
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
/* remove from global mount namespace list */
- if (!is_anon_ns(ns)) {
- mnt_ns_tree_write_lock();
- rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
- list_bidir_del_rcu(&ns->mnt_ns_list);
- mnt_ns_tree_write_unlock();
- }
+ if (ns_tree_active(ns))
+ ns_tree_remove(ns);
- call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
-}
-
-static int mnt_ns_find(const void *key, const struct rb_node *node)
-{
- const u64 mnt_ns_id = *(u64 *)key;
- const struct mnt_namespace *ns = node_to_mnt_ns(node);
-
- if (mnt_ns_id < ns->seq)
- return -1;
- if (mnt_ns_id > ns->seq)
- return 1;
- return 0;
+ call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu);
}
/*
@@ -225,28 +175,21 @@ static int mnt_ns_find(const void *key, const struct rb_node *node)
*/
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
- struct mnt_namespace *ns;
- struct rb_node *node;
- unsigned int seq;
+ struct mnt_namespace *mnt_ns;
+ struct ns_common *ns;
guard(rcu)();
- do {
- seq = read_seqbegin(&mnt_ns_tree_lock);
- node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
- if (node)
- break;
- } while (read_seqretry(&mnt_ns_tree_lock, seq));
-
- if (!node)
+ ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS);
+ if (!ns)
return NULL;
/*
* The last reference count is put with RCU delay so we can
* unconditonally acquire a reference here.
*/
- ns = node_to_mnt_ns(node);
- refcount_inc(&ns->passive);
- return ns;
+ mnt_ns = container_of(ns, struct mnt_namespace, ns);
+ refcount_inc(&mnt_ns->passive);
+ return mnt_ns;
}
static inline void lock_mount_hash(void)
@@ -1017,7 +960,7 @@ static inline bool check_anonymous_mnt(struct mount *mnt)
return false;
seq = mnt->mnt_ns->seq_origin;
- return !seq || (seq == current->nsproxy->mnt_ns->seq);
+ return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id);
}
/*
@@ -2152,19 +2095,16 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
+ struct ns_common *ns;
+
guard(rcu)();
for (;;) {
- struct list_head *list;
-
- if (previous)
- list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
- else
- list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
- if (list_is_head(list, &mnt_ns_list))
- return ERR_PTR(-ENOENT);
+ ns = ns_tree_adjoined_rcu(mntns, previous);
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
- mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
+ mntns = to_mnt_ns(ns);
/*
* The last passive reference count is put with RCU
@@ -2179,7 +2119,7 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr
* the mount namespace and it might already be on its
* deathbed.
*/
- if (!refcount_inc_not_zero(&mntns->ns.count))
+ if (!ns_ref_get(mntns))
continue;
return mntns;
@@ -2204,7 +2144,7 @@ static bool mnt_ns_loop(struct dentry *dentry)
if (!mnt_ns)
return false;
- return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
+ return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id;
}
struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
@@ -2455,7 +2395,7 @@ struct vfsmount *clone_private_mount(const struct path *path)
return ERR_PTR(-EINVAL);
}
- if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if (__has_locked_children(old_mnt, path->dentry))
@@ -3080,7 +3020,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
if (is_anon_ns(src_mnt_ns))
ns->seq_origin = src_mnt_ns->seq_origin;
else
- ns->seq_origin = src_mnt_ns->seq;
+ ns->seq_origin = src_mnt_ns->ns.ns_id;
}
mnt = __do_loopback(path, recursive);
@@ -3289,7 +3229,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
* If you've mounted a non-root directory somewhere and want to do remount
* on it - tough luck.
*/
-static int do_remount(struct path *path, int ms_flags, int sb_flags,
+static int do_remount(struct path *path, int sb_flags,
int mnt_flags, void *data)
{
int err;
@@ -3727,8 +3667,10 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
int error;
error = security_sb_kern_mount(sb);
- if (!error && mount_too_revealing(sb, &mnt_flags))
+ if (!error && mount_too_revealing(sb, &mnt_flags)) {
+ errorfcp(fc, "VFS", "Mount too revealing");
error = -EPERM;
+ }
if (unlikely(error)) {
fc_drop_locked(fc);
@@ -4112,7 +4054,7 @@ int path_mount(const char *dev_name, struct path *path,
if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
return do_reconfigure_mnt(path, mnt_flags);
if (flags & MS_REMOUNT)
- return do_remount(path, flags, sb_flags, mnt_flags, data_page);
+ return do_remount(path, sb_flags, mnt_flags, data_page);
if (flags & MS_BIND)
return do_loopback(path, dev_name, flags & MS_REC);
if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
@@ -4151,20 +4093,11 @@ static void dec_mnt_namespaces(struct ucounts *ucounts)
static void free_mnt_ns(struct mnt_namespace *ns)
{
if (!is_anon_ns(ns))
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
dec_mnt_namespaces(ns->ucounts);
mnt_ns_tree_remove(ns);
}
-/*
- * Assign a sequence number so we can detect when we attempt to bind
- * mount a reference to an older mount namespace into the current
- * mount namespace, preventing reference counting loops. A 64bit
- * number incrementing at 10Ghz will take 12,427 years to wrap which
- * is effectively never, so we can ignore the possibility.
- */
-static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
-
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
struct mnt_namespace *new_ns;
@@ -4180,22 +4113,20 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
}
- if (!anon) {
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
- dec_mnt_namespaces(ucounts);
- return ERR_PTR(ret);
- }
+
+ if (anon)
+ ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO);
+ else
+ ret = ns_common_init(new_ns);
+ if (ret) {
+ kfree(new_ns);
+ dec_mnt_namespaces(ucounts);
+ return ERR_PTR(ret);
}
- new_ns->ns.ops = &mntns_operations;
if (!anon)
- new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
- refcount_set(&new_ns->ns.count, 1);
+ ns_tree_gen_id(&new_ns->ns);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
- INIT_LIST_HEAD(&new_ns->mnt_ns_list);
- RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
@@ -4203,7 +4134,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
}
__latent_entropy
-struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
+struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
@@ -4234,7 +4165,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
- ns_free_inum(&new_ns->ns);
+ ns_common_free(ns);
dec_mnt_namespaces(new_ns->ucounts);
mnt_ns_release(new_ns);
return ERR_CAST(new);
@@ -4281,7 +4212,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
if (pwdmnt)
mntput(pwdmnt);
- mnt_ns_tree_add(new_ns);
+ ns_tree_add_raw(new_ns);
return new_ns;
}
@@ -4444,7 +4375,7 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
ret = -EPERM;
if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
- pr_warn("VFS: Mount too revealing\n");
+ errorfcp(fc, "VFS", "Mount too revealing");
goto err_unlock;
}
@@ -5007,7 +4938,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
return -EINVAL;
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWUSER)
+ if (ns->ns_type != CLONE_NEWUSER)
return -EINVAL;
/*
@@ -5400,7 +5331,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
s->sm.mask |= STATMOUNT_MNT_NS_ID;
- s->sm.mnt_ns_id = ns->seq;
+ s->sm.mnt_ns_id = ns->ns.ns_id;
}
static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
@@ -5711,7 +5642,6 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
- struct path root __free(path_put) = {};
struct mount *m;
int err;
@@ -5723,7 +5653,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!s->mnt)
return -ENOENT;
- err = grab_requested_root(ns, &root);
+ err = grab_requested_root(ns, &s->root);
if (err)
return err;
@@ -5732,7 +5662,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
* mounts to show users.
*/
m = real_mount(s->mnt);
- if (!is_path_reachable(m, m->mnt.mnt_root, &root) &&
+ if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -5740,8 +5670,6 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (err)
return err;
- s->root = root;
-
/*
* Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
* can change concurrently as we only hold the read-side of the
@@ -5910,7 +5838,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
return ERR_PTR(-EINVAL);
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return ERR_PTR(-EINVAL);
mnt_ns = to_mnt_ns(ns);
@@ -5963,28 +5891,40 @@ retry:
if (!ret)
ret = copy_statmount_to_user(ks);
kvfree(ks->seq.buf);
+ path_put(&ks->root);
if (retry_statmount(ret, &seq_size))
goto retry;
return ret;
}
-static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
- u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
- bool reverse)
+struct klistmount {
+ u64 last_mnt_id;
+ u64 mnt_parent_id;
+ u64 *kmnt_ids;
+ u32 nr_mnt_ids;
+ struct mnt_namespace *ns;
+ struct path root;
+};
+
+static ssize_t do_listmount(struct klistmount *kls, bool reverse)
{
- struct path root __free(path_put) = {};
+ struct mnt_namespace *ns = kls->ns;
+ u64 mnt_parent_id = kls->mnt_parent_id;
+ u64 last_mnt_id = kls->last_mnt_id;
+ u64 *mnt_ids = kls->kmnt_ids;
+ size_t nr_mnt_ids = kls->nr_mnt_ids;
struct path orig;
struct mount *r, *first;
ssize_t ret;
rwsem_assert_held(&namespace_sem);
- ret = grab_requested_root(ns, &root);
+ ret = grab_requested_root(ns, &kls->root);
if (ret)
return ret;
if (mnt_parent_id == LSMT_ROOT) {
- orig = root;
+ orig = kls->root;
} else {
orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
if (!orig.mnt)
@@ -5996,7 +5936,7 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
* Don't trigger audit denials. We just want to determine what
* mounts to show users.
*/
- if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
+ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -6029,14 +5969,45 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
return ret;
}
+static void __free_klistmount_free(const struct klistmount *kls)
+{
+ path_put(&kls->root);
+ kvfree(kls->kmnt_ids);
+ mnt_ns_release(kls->ns);
+}
+
+static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq,
+ size_t nr_mnt_ids)
+{
+
+ u64 last_mnt_id = kreq->param;
+
+ /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
+ return -EINVAL;
+
+ kls->last_mnt_id = last_mnt_id;
+
+ kls->nr_mnt_ids = nr_mnt_ids;
+ kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids),
+ GFP_KERNEL_ACCOUNT);
+ if (!kls->kmnt_ids)
+ return -ENOMEM;
+
+ kls->ns = grab_requested_mnt_ns(kreq);
+ if (!kls->ns)
+ return -ENOENT;
+
+ kls->mnt_parent_id = kreq->mnt_id;
+ return 0;
+}
+
SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
- u64 *kmnt_ids __free(kvfree) = NULL;
+ struct klistmount kls __free(klistmount_free) = {};
const size_t maxcount = 1000000;
- struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
struct mnt_id_req kreq;
- u64 last_mnt_id;
ssize_t ret;
if (flags & ~LISTMOUNT_REVERSE)
@@ -6057,22 +6028,12 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
- last_mnt_id = kreq.param;
- /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
- if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
- return -EINVAL;
-
- kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
- GFP_KERNEL_ACCOUNT);
- if (!kmnt_ids)
- return -ENOMEM;
-
- ns = grab_requested_mnt_ns(&kreq);
- if (!ns)
- return -ENOENT;
+ ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids);
+ if (ret)
+ return ret;
- if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
- !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+ if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) &&
+ !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN))
return -ENOENT;
/*
@@ -6080,39 +6041,43 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
* listmount() doesn't care about any mount properties.
*/
scoped_guard(rwsem_read, &namespace_sem)
- ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
- nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
+ ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE));
if (ret <= 0)
return ret;
- if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
+ if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids)))
return -EFAULT;
return ret;
}
+struct mnt_namespace init_mnt_ns = {
+ .ns.inum = ns_init_inum(&init_mnt_ns),
+ .ns.ops = &mntns_operations,
+ .user_ns = &init_user_ns,
+ .ns.__ns_ref = REFCOUNT_INIT(1),
+ .ns.ns_type = ns_common_type(&init_mnt_ns),
+ .passive = REFCOUNT_INIT(1),
+ .mounts = RB_ROOT,
+ .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
+};
+
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
struct mount *m;
- struct mnt_namespace *ns;
struct path root;
- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
+ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
if (IS_ERR(mnt))
panic("Can't create rootfs");
- ns = alloc_mnt_ns(&init_user_ns, true);
- if (IS_ERR(ns))
- panic("Can't allocate initial namespace");
- ns->seq = atomic64_inc_return(&mnt_ns_seq);
- ns->ns.inum = PROC_MNT_INIT_INO;
m = real_mount(mnt);
- ns->root = m;
- ns->nr_mounts = 1;
- mnt_add_to_ns(ns, m);
- init_task.nsproxy->mnt_ns = ns;
- get_mnt_ns(ns);
+ init_mnt_ns.root = m;
+ init_mnt_ns.nr_mounts = 1;
+ mnt_add_to_ns(&init_mnt_ns, m);
+ init_task.nsproxy->mnt_ns = &init_mnt_ns;
+ get_mnt_ns(&init_mnt_ns);
root.mnt = mnt;
root.dentry = mnt->mnt_root;
@@ -6120,7 +6085,7 @@ static void __init init_mount_tree(void)
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
- mnt_ns_tree_add(ns);
+ ns_tree_add(&init_mnt_ns);
}
void __init mnt_init(void)
@@ -6160,7 +6125,7 @@ void __init mnt_init(void)
void put_mnt_ns(struct mnt_namespace *ns)
{
- if (!refcount_dec_and_test(&ns->ns.count))
+ if (!ns_ref_put(ns))
return;
namespace_lock();
emptied_ns = ns;
@@ -6409,7 +6374,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns)
const struct proc_ns_operations mntns_operations = {
.name = "mnt",
- .type = CLONE_NEWNS,
.get = mntns_get,
.put = mntns_put,
.install = mntns_install,
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 18b3dc74c70e..37ab6f28b5ad 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -369,7 +369,7 @@ void netfs_readahead(struct readahead_control *ractl)
return netfs_put_request(rreq, netfs_rreq_trace_put_return);
cleanup_free:
- return netfs_put_request(rreq, netfs_rreq_trace_put_failed);
+ return netfs_put_failed_request(rreq);
}
EXPORT_SYMBOL(netfs_readahead);
@@ -472,7 +472,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
return ret < 0 ? ret : 0;
discard:
- netfs_put_request(rreq, netfs_rreq_trace_put_discard);
+ netfs_put_failed_request(rreq);
alloc_error:
folio_unlock(folio);
return ret;
@@ -532,7 +532,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
return ret < 0 ? ret : 0;
discard:
- netfs_put_request(rreq, netfs_rreq_trace_put_discard);
+ netfs_put_failed_request(rreq);
alloc_error:
folio_unlock(folio);
return ret;
@@ -699,7 +699,7 @@ have_folio_no_wait:
return 0;
error_put:
- netfs_put_request(rreq, netfs_rreq_trace_put_failed);
+ netfs_put_failed_request(rreq);
error:
if (folio) {
folio_unlock(folio);
@@ -754,7 +754,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
return ret < 0 ? ret : 0;
error_put:
- netfs_put_request(rreq, netfs_rreq_trace_put_discard);
+ netfs_put_failed_request(rreq);
error:
_leave(" = %d", ret);
return ret;
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index f27ea5099a68..09394ac2c180 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -347,7 +347,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
folio_put(folio);
ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
if (ret < 0)
- goto error_folio_unlock;
+ goto out;
continue;
copied:
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index a05e13472baf..a498ee8d6674 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -131,6 +131,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
if (rreq->len == 0) {
pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
+ netfs_put_request(rreq, netfs_rreq_trace_put_discard);
return -EIO;
}
@@ -205,7 +206,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
if (user_backed_iter(iter)) {
ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0);
if (ret < 0)
- goto out;
+ goto error_put;
rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec;
rreq->direct_bv_count = ret;
rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
@@ -238,6 +239,10 @@ out:
if (ret > 0)
orig_count -= ret;
return ret;
+
+error_put:
+ netfs_put_failed_request(rreq);
+ return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index a16660ab7f83..a9d1c3b2c084 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -57,7 +57,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0);
if (n < 0) {
ret = n;
- goto out;
+ goto error_put;
}
wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec;
wreq->direct_bv_count = n;
@@ -101,6 +101,10 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
out:
netfs_put_request(wreq, netfs_rreq_trace_put_return);
return ret;
+
+error_put:
+ netfs_put_failed_request(wreq);
+ return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index d4f16fefd965..4319611f5354 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -87,6 +87,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
void netfs_clear_subrequests(struct netfs_io_request *rreq);
void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
+void netfs_put_failed_request(struct netfs_io_request *rreq);
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq);
static inline void netfs_see_request(struct netfs_io_request *rreq,
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 20748bcfbf59..486166460e17 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -321,7 +321,7 @@ void netfs_wake_collector(struct netfs_io_request *rreq)
{
if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) &&
!test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) {
- queue_work(system_unbound_wq, &rreq->work);
+ queue_work(system_dfl_wq, &rreq->work);
} else {
trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue);
wake_up(&rreq->waitq);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e8c99738b5bb..b8c4918d3dcd 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -116,10 +116,8 @@ static void netfs_free_request_rcu(struct rcu_head *rcu)
netfs_stat_d(&netfs_n_rh_rreq);
}
-static void netfs_free_request(struct work_struct *work)
+static void netfs_deinit_request(struct netfs_io_request *rreq)
{
- struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, cleanup_work);
struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned int i;
@@ -149,6 +147,14 @@ static void netfs_free_request(struct work_struct *work)
if (atomic_dec_and_test(&ictx->io_count))
wake_up_var(&ictx->io_count);
+}
+
+static void netfs_free_request(struct work_struct *work)
+{
+ struct netfs_io_request *rreq =
+ container_of(work, struct netfs_io_request, cleanup_work);
+
+ netfs_deinit_request(rreq);
call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
@@ -163,11 +169,29 @@ void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace
dead = __refcount_dec_and_test(&rreq->ref, &r);
trace_netfs_rreq_ref(debug_id, r - 1, what);
if (dead)
- WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work));
+ WARN_ON(!queue_work(system_dfl_wq, &rreq->cleanup_work));
}
}
/*
+ * Free a request (synchronously) that was just allocated but has
+ * failed before it could be submitted.
+ */
+void netfs_put_failed_request(struct netfs_io_request *rreq)
+{
+ int r = refcount_read(&rreq->ref);
+
+	/*
+	 * New requests have two references (see netfs_alloc_request()),
+	 * and this function is only allowed on new request objects.
+	 */
+ WARN_ON_ONCE(r != 2);
+
+ trace_netfs_rreq_ref(rreq->debug_id, r, netfs_rreq_trace_put_failed);
+ netfs_free_request(&rreq->cleanup_work);
+}
+
+/*
* Allocate and partially initialise an I/O request structure.
*/
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
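The new netfs_put_failed_request() gives pre-submission error paths a synchronous teardown instead of the usual refcount drop, as the callers converted above show. A hedged sketch of the calling convention; netfs_example_begin() and netfs_example_prepare() are hypothetical, and the netfs_alloc_request() argument list is assumed from current netfs code:

/* Hypothetical allocation helper showing the convention: before the request
 * has been submitted, failures free it synchronously with
 * netfs_put_failed_request() rather than netfs_put_request(). */
static struct netfs_io_request *netfs_example_begin(struct address_space *mapping,
						    struct file *file,
						    loff_t start, size_t len)
{
	struct netfs_io_request *rreq;
	int ret;

	rreq = netfs_alloc_request(mapping, file, start, len, NETFS_READAHEAD);
	if (IS_ERR(rreq))
		return rreq;

	ret = netfs_example_prepare(rreq);	/* hypothetical setup step */
	if (ret < 0) {
		/* not yet submitted: tear down synchronously */
		netfs_put_failed_request(rreq);
		return ERR_PTR(ret);
	}
	return rreq;
}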
diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
index 8097bc069c1d..a1489aa29f78 100644
--- a/fs/netfs/read_pgpriv2.c
+++ b/fs/netfs/read_pgpriv2.c
@@ -118,7 +118,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache(
return creq;
cancel_put:
- netfs_put_request(creq, netfs_rreq_trace_put_return);
+ netfs_put_failed_request(creq);
cancel:
rreq->copy_to_cache = ERR_PTR(-ENOBUFS);
clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c
index fa622a6cd56d..5c0dc4efc792 100644
--- a/fs/netfs/read_single.c
+++ b/fs/netfs/read_single.c
@@ -189,7 +189,7 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite
return ret;
cleanup_free:
- netfs_put_request(rreq, netfs_rreq_trace_put_failed);
+ netfs_put_failed_request(rreq);
return ret;
}
EXPORT_SYMBOL(netfs_read_single);
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 0584cba1a043..dd8743bc8d7f 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -133,8 +133,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
return wreq;
nomem:
- wreq->error = -ENOMEM;
- netfs_put_request(wreq, netfs_rreq_trace_put_failed);
+ netfs_put_failed_request(wreq);
return ERR_PTR(-ENOMEM);
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8fb4a950dd55..4e3dcc157a83 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -888,6 +888,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
if (fsinfo->xattr_support)
server->caps |= NFS_CAP_XATTR;
+ else
+ server->caps &= ~NFS_CAP_XATTR;
#endif
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 86e36c630f09..8059ece82468 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -28,6 +28,7 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/gfp.h>
+#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/compaction.h>
@@ -280,6 +281,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL_GPL(nfs_file_fsync);
+void nfs_truncate_last_folio(struct address_space *mapping, loff_t from,
+ loff_t to)
+{
+ struct folio *folio;
+
+ if (from >= to)
+ return;
+
+ folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+
+ if (folio_test_uptodate(folio)) {
+ loff_t fpos = folio_pos(folio);
+ size_t offset = from - fpos;
+ size_t end = folio_size(folio);
+
+ if (to - fpos < end)
+ end = to - fpos;
+ folio_zero_segment(folio, offset, end);
+ trace_nfs_size_truncate_folio(mapping->host, to);
+ }
+
+ folio_unlock(folio);
+ folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(nfs_truncate_last_folio);
+
/*
* Decide whether a read/modify/write cycle may be more efficient
* then a modify/write/read cycle when writing to a page in the
@@ -356,6 +388,7 @@ static int nfs_write_begin(const struct kiocb *iocb,
dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
+ nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos);
fgp |= fgf_set_order(len);
start:
@@ -442,10 +475,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
folio->index, offset, length);
- if (offset != 0 || length < folio_size(folio))
- return;
/* Cancel any unstarted writes on this page */
- nfs_wb_folio_cancel(inode, folio);
+ if (offset != 0 || length < folio_size(folio))
+ nfs_wb_folio(inode, folio);
+ else
+ nfs_wb_folio_cancel(inode, folio);
folio_wait_private_2(folio); /* [DEPRECATED] */
trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length);
}
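
nfs_truncate_last_folio() zeroes the tail of the folio that held the old EOF, up to the new size, so stale bytes in a partially filled last folio are not exposed when the file grows. The offset arithmetic, as a standalone sketch (PAGE_SHIFT value and helper name are illustrative only, order-0 folio assumed):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)

/*
 * Which bytes of the folio containing the old EOF must be zeroed when the
 * file grows from 'from' to 'to'?  Mirrors the offset/end computation in
 * nfs_truncate_last_folio().
 */
static void tail_zero_range(unsigned long long from, unsigned long long to)
{
	unsigned long long fpos = (from >> PAGE_SHIFT) << PAGE_SHIFT;
	unsigned long long offset = from - fpos;
	unsigned long long end = PAGE_SIZE;

	if (from >= to)
		return;
	if (to - fpos < end)
		end = to - fpos;
	printf("folio at %llu: zero bytes [%llu, %llu)\n", fpos, offset, end);
}

int main(void)
{
	tail_zero_range(5000, 20000);	/* growth crosses the old last page */
	tail_zero_range(5000, 6000);	/* growth stays within the old last page */
	return 0;
}
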
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 8dc921d83538..9edb5f9b0c4e 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -293,7 +293,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
struct pnfs_layout_segment *l2)
{
const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
- const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1);
+ const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);
u32 i;
if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
@@ -773,8 +773,11 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
continue;
if (check_device &&
- nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+ nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) {
+ // Reinitialize the error state in case this is the last iteration
+ ds = ERR_PTR(-EINVAL);
continue;
+ }
*best_idx = idx;
break;
@@ -804,7 +807,7 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
- if (ds)
+ if (!IS_ERR(ds))
return ds;
return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
}
@@ -818,7 +821,7 @@ ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
best_idx);
- if (ds || !pgio->pg_mirror_idx)
+ if (!IS_ERR(ds) || !pgio->pg_mirror_idx)
return ds;
return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
}
@@ -868,7 +871,7 @@ retry:
req->wb_nio = 0;
ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
- if (!ds) {
+ if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
pnfs_generic_pg_cleanup(pgio);
@@ -1072,11 +1075,13 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
u32 new_idx = 0;
+ struct nfs4_pnfs_ds *ds;
- if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx))
- ff_layout_send_layouterror(hdr->lseg);
- else
+ ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx);
+ if (IS_ERR(ds))
pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+ else
+ ff_layout_send_layouterror(hdr->lseg);
pnfs_read_resend_pnfs(hdr, new_idx);
}
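
The flexfiles changes above switch several call sites from NULL checks to IS_ERR() because ff_layout_choose_ds_for_read() now reports failure as an ERR_PTR() value. A small userspace re-implementation of that convention, simplified from the kernel's <linux/err.h>, shows why a plain `if (!ds)` test lets error pointers through:

#include <errno.h>
#include <stdio.h>

/* Simplified from <linux/err.h>: errnos live in the top 4095 addresses. */
#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)		{ return (void *)error; }
static inline long PTR_ERR(const void *ptr)	{ return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* A chooser that reports "no usable server" as -EINVAL rather than NULL. */
static void *choose_ds(int available)
{
	static int dummy_ds;

	return available ? (void *)&dummy_ds : ERR_PTR(-EINVAL);
}

int main(void)
{
	void *ds = choose_ds(0);

	printf("!ds        -> %d (old check: error pointer looks valid)\n", !ds);
	printf("IS_ERR(ds) -> %d\n", IS_ERR(ds));
	if (IS_ERR(ds))
		printf("PTR_ERR(ds) -> %ld\n", PTR_ERR(ds));
	return 0;
}
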
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 338ef77ae423..9bdaf7f38bed 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
int nfs_drop_inode(struct inode *inode)
{
- return NFS_STALE(inode) || generic_drop_inode(inode);
+ return NFS_STALE(inode) || inode_generic_drop(inode);
}
EXPORT_SYMBOL_GPL(nfs_drop_inode);
@@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(inode),
nfs_display_fhandle_hash(fh),
- atomic_read(&inode->i_count));
+ icount_read(inode));
out:
return inode;
@@ -716,6 +716,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct nfs_fattr *fattr;
+ loff_t oldsize = i_size_read(inode);
int error = 0;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -731,7 +732,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (error)
return error;
- if (attr->ia_size == i_size_read(inode))
+ if (attr->ia_size == oldsize)
attr->ia_valid &= ~ATTR_SIZE;
}
@@ -767,8 +768,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
trace_nfs_setattr_enter(inode);
/* Write all dirty data */
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode)) {
+ nfs_file_block_o_direct(NFS_I(inode));
nfs_sync_inode(inode);
+ }
fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
if (fattr == NULL) {
@@ -777,8 +780,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
- if (error == 0)
+ if (error == 0) {
+ if (attr->ia_valid & ATTR_SIZE)
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ attr->ia_size);
error = nfs_refresh_inode(inode, fattr);
+ }
nfs_free_fattr(fattr);
out:
trace_nfs_setattr_exit(inode, error);
@@ -2229,7 +2236,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
nfs_display_fhandle_hash(NFS_FH(inode)),
- atomic_read(&inode->i_count), fattr->valid);
+ icount_read(inode), fattr->valid);
if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
/* Only a mounted-on-fileid? Just exit */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 74d712b58423..c0a44f389f8f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -437,6 +437,8 @@ int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
int nfs_check_flags(int);
+void nfs_truncate_last_folio(struct address_space *mapping, loff_t from,
+ loff_t to);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
@@ -530,6 +532,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
}
+/* Must be called with exclusively locked inode->i_rwsem */
+static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi)
+{
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ inode_dio_wait(&nfsi->vfs_inode);
+ }
+}
+
+
/* namespace.c */
#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 3388faf2acb9..d275b0a250bf 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -14,15 +14,6 @@
#include "internal.h"
-/* Call with exclusively locked inode->i_rwsem */
-static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
-{
- if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
- clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
- inode_dio_wait(inode);
- }
-}
-
/**
* nfs_start_io_read - declare the file is being used for buffered reads
* @inode: file inode
@@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode)
err = down_write_killable(&inode->i_rwsem);
if (err)
return err;
- nfs_block_o_direct(nfsi, inode);
+ nfs_file_block_o_direct(nfsi);
downgrade_write(&inode->i_rwsem);
return 0;
@@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode)
err = down_write_killable(&inode->i_rwsem);
if (!err)
- nfs_block_o_direct(NFS_I(inode), inode);
+ nfs_file_block_o_direct(NFS_I(inode));
return err;
}
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index bd5fca285899..97abf62f109d 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -180,10 +180,8 @@ static void nfs_local_probe(struct nfs_client *clp)
return;
}
- if (nfs_client_is_local(clp)) {
- /* If already enabled, disable and re-enable */
- nfs_localio_disable_client(clp);
- }
+ if (nfs_client_is_local(clp))
+ return;
if (!nfs_uuid_begin(&clp->cl_uuid))
return;
@@ -244,7 +242,8 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
case -ENOMEM:
case -ENXIO:
case -ENOENT:
- /* Revalidate localio, will disable if unsupported */
+ /* Revalidate localio */
+ nfs_localio_disable_client(clp);
nfs_local_probe(clp);
}
}
@@ -453,12 +452,13 @@ static void nfs_local_call_read(struct work_struct *work)
nfs_local_iter_init(&iter, iocb, READ);
status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+
+ revert_creds(save_cred);
+
if (status != -EIOCBQUEUED) {
nfs_local_read_done(iocb, status);
nfs_local_pgio_release(iocb);
}
-
- revert_creds(save_cred);
}
static int
@@ -648,14 +648,15 @@ static void nfs_local_call_write(struct work_struct *work)
file_start_write(filp);
status = filp->f_op->write_iter(&iocb->kiocb, &iter);
file_end_write(filp);
+
+ revert_creds(save_cred);
+ current->flags = old_flags;
+
if (status != -EIOCBQUEUED) {
nfs_local_write_done(iocb, status);
nfs_local_vfs_getattr(iocb);
nfs_local_pgio_release(iocb);
}
-
- revert_creds(save_cred);
- current->flags = old_flags;
}
static int
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 7f1ec9c67ff2..f9a3a1fbf44c 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -335,7 +335,7 @@ static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp)
num *= HZ;
*((int *)kp->arg) = num;
if (!list_empty(&nfs_automount_list))
- mod_delayed_work(system_wq, &nfs_automount_task, num);
+ mod_delayed_work(system_percpu_wq, &nfs_automount_task, num);
} else {
*((int *)kp->arg) = -1*HZ;
cancel_delayed_work(&nfs_automount_task);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 01c01f45358b..6a0b5871ba3b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
exception.inode = inode;
exception.state = lock->open_context->state;
+ nfs_file_block_o_direct(NFS_I(inode));
err = nfs_sync_inode(inode);
if (err)
goto out;
@@ -137,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
};
struct inode *inode = file_inode(filep);
+ loff_t oldsize = i_size_read(inode);
int err;
if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
@@ -145,7 +147,11 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
- if (err == -EOPNOTSUPP)
+
+ if (err == 0)
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ offset + len);
+ else if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE |
NFS_CAP_ZERO_RANGE);
@@ -183,6 +189,7 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE],
};
struct inode *inode = file_inode(filep);
+ loff_t oldsize = i_size_read(inode);
int err;
if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE))
@@ -191,9 +198,11 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
- if (err == 0)
+ if (err == 0) {
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ offset + len);
truncate_pagecache_range(inode, offset, (offset + len) -1);
- if (err == -EOPNOTSUPP)
+ } else if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE;
inode_unlock(inode);
@@ -354,22 +363,27 @@ out:
/**
* nfs42_copy_dest_done - perform inode cache updates after clone/copy offload
- * @inode: pointer to destination inode
+ * @file: pointer to destination file
* @pos: destination offset
* @len: copy length
+ * @oldsize: length of the file prior to clone/copy
*
* Punch a hole in the inode page cache, so that the NFS client will
* know to retrieve new data.
* Update the file size if necessary, and then mark the inode as having
* invalid cached values for change attribute, ctime, mtime and space used.
*/
-static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len)
+static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len,
+ loff_t oldsize)
{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
loff_t newsize = pos + len;
loff_t end = newsize - 1;
- WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_SHIFT, end >> PAGE_SHIFT));
+ nfs_truncate_last_folio(mapping, oldsize, pos);
+ WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT));
spin_lock(&inode->i_lock);
if (newsize > i_size_read(inode))
@@ -402,6 +416,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
struct nfs_server *src_server = NFS_SERVER(src_inode);
loff_t pos_src = args->src_pos;
loff_t pos_dst = args->dst_pos;
+ loff_t oldsize_dst = i_size_read(dst_inode);
size_t count = args->count;
ssize_t status;
@@ -430,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
return status;
}
+ nfs_file_block_o_direct(NFS_I(dst_inode));
status = nfs_sync_inode(dst_inode);
if (status)
return status;
@@ -475,7 +491,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
goto out;
}
- nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count);
+ nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst);
nfs_invalidate_atime(src_inode);
status = res->write_res.count;
out:
@@ -1242,6 +1258,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
struct nfs42_clone_res res = {
.server = server,
};
+ loff_t oldsize_dst = i_size_read(dst_inode);
int status;
msg->rpc_argp = &args;
@@ -1276,7 +1293,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
/* a zero-length count means clone to EOF in src */
if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE)
count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset;
- nfs42_copy_dest_done(dst_inode, dst_offset, count);
+ nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst);
status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1d6b5f4230c9..c9a0d1e420c6 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -278,9 +278,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
lock_two_nondirectories(src_inode, dst_inode);
/* flush all pending writes on both src and dst so that server
* has the latest data */
+ nfs_file_block_o_direct(NFS_I(src_inode));
ret = nfs_sync_inode(src_inode);
if (ret)
goto out_unlock;
+ nfs_file_block_o_direct(NFS_I(dst_inode));
ret = nfs_sync_inode(dst_inode);
if (ret)
goto out_unlock;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7d2b67e06cc3..ce61253efd45 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4013,8 +4013,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
res.attr_bitmask[2];
}
memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
- server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS |
- NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL);
+ server->caps &=
+ ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS |
+ NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS |
+ NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME);
server->fattr_valid = NFS_ATTR_FATTR_V4;
if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
@@ -4092,7 +4094,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
};
int err;
- nfs_server_set_init_caps(server);
do {
err = nfs4_handle_exception(server,
_nfs4_server_capabilities(server, fhandle),
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index db3811af0796..18ae614e5a6c 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -122,7 +122,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
timeout = 5 * HZ;
dprintk("%s: requeueing work. Lease period = %ld\n",
__func__, (timeout + HZ - 1) / HZ);
- mod_delayed_work(system_wq, &clp->cl_renewd, timeout);
+ mod_delayed_work(system_percpu_wq, &clp->cl_renewd, timeout);
set_bit(NFS_CS_RENEWD, &clp->cl_res_state);
spin_unlock(&clp->cl_lock);
}
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 96b1323318c2..627115179795 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -272,6 +272,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class,
TP_ARGS(inode, new_size))
DEFINE_NFS_UPDATE_SIZE_EVENT(truncate);
+DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio);
DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
DEFINE_NFS_UPDATE_SIZE_EVENT(update);
DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 8b7c04737967..647c53d1418a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -237,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error)
}
/*
- * nfs_page_group_search_locked
- * @head - head request of page group
- * @page_offset - offset into page
+ * nfs_page_covers_folio
+ * @req: struct nfs_page
*
- * Search page group with head @head to find a request that contains the
- * page offset @page_offset.
- *
- * Returns a pointer to the first matching nfs request, or NULL if no
- * match is found.
- *
- * Must be called with the page group lock held
- */
-static struct nfs_page *
-nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
-{
- struct nfs_page *req;
-
- req = head;
- do {
- if (page_offset >= req->wb_pgbase &&
- page_offset < (req->wb_pgbase + req->wb_bytes))
- return req;
-
- req = req->wb_this_page;
- } while (req != head);
-
- return NULL;
-}
-
-/*
- * nfs_page_group_covers_page
- * @head - head request of page group
- *
- * Return true if the page group with head @head covers the whole page,
- * returns false otherwise
+ * Return true if the request covers the whole folio.
+ * Note that the caller should ensure all subrequests have been joined.
*/
static bool nfs_page_group_covers_page(struct nfs_page *req)
{
unsigned int len = nfs_folio_length(nfs_page_to_folio(req));
- struct nfs_page *tmp;
- unsigned int pos = 0;
-
- nfs_page_group_lock(req);
- for (;;) {
- tmp = nfs_page_group_search_locked(req->wb_head, pos);
- if (!tmp)
- break;
- pos = tmp->wb_pgbase + tmp->wb_bytes;
- }
-
- nfs_page_group_unlock(req);
- return pos >= len;
+ return req->wb_pgbase == 0 && req->wb_bytes == len;
}
/* We can set the PG_uptodate flag if we see that a write request
@@ -2045,6 +2003,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
* release it */
nfs_inode_remove_request(req);
nfs_unlock_and_release_request(req);
+ folio_cancel_dirty(folio);
}
return ret;
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 732abf6b92a5..85ca663c052c 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -113,7 +113,7 @@ static void
nfsd_file_schedule_laundrette(void)
{
if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags))
- queue_delayed_work(system_unbound_wq, &nfsd_filecache_laundrette,
+ queue_delayed_work(system_dfl_wq, &nfsd_filecache_laundrette,
NFSD_LAUNDRETTE_DELAY);
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index edf050766e57..aa4a95713a48 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1951,10 +1951,9 @@ retry:
goto out_dput_old;
} else {
struct renamedata rd = {
- .old_mnt_idmap = &nop_mnt_idmap,
+ .mnt_idmap = &nop_mnt_idmap,
.old_parent = fdentry,
.old_dentry = odentry,
- .new_mnt_idmap = &nop_mnt_idmap,
.new_parent = tdentry,
.new_dentry = ndentry,
};
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 14868a3dd592..bc52afbfc5c7 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
************************************************************************/
static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d.%d\n",
NILFS_CURRENT_REV, NILFS_MINOR_REV);
@@ -1087,7 +1087,7 @@ static const char features_readme_str[] =
"(1) revision\n\tshow current revision of NILFS file system driver.\n";
static ssize_t nilfs_feature_README_show(struct kobject *kobj,
- struct attribute *attr,
+ struct kobj_attribute *attr,
char *buf)
{
return sysfs_emit(buf, features_readme_str);
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index 78a87a016928..d370cd5cce3f 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups {
struct completion sg_segments_kobj_unregister;
};
-#define NILFS_COMMON_ATTR_STRUCT(name) \
+#define NILFS_KOBJ_ATTR_STRUCT(name) \
struct nilfs_##name##_attr { \
struct attribute attr; \
- ssize_t (*show)(struct kobject *, struct attribute *, \
+ ssize_t (*show)(struct kobject *, struct kobj_attribute *, \
char *); \
- ssize_t (*store)(struct kobject *, struct attribute *, \
+ ssize_t (*store)(struct kobject *, struct kobj_attribute *, \
const char *, size_t); \
}
-NILFS_COMMON_ATTR_STRUCT(feature);
+NILFS_KOBJ_ATTR_STRUCT(feature);
#define NILFS_DEV_ATTR_STRUCT(name) \
struct nilfs_##name##_attr { \
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 079b868552c2..46bfc543f946 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -66,7 +66,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
* removed all zero refcount inodes, in any case. Test to
* be sure.
*/
- if (!atomic_read(&inode->i_count)) {
+ if (!icount_read(inode)) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 798340db69d7..55a03bb05aa1 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -428,7 +428,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
conn->destroy_next = connector_destroy_list;
connector_destroy_list = conn;
spin_unlock(&destroy_lock);
- queue_work(system_unbound_wq, &connector_reaper_work);
+ queue_work(system_dfl_wq, &connector_reaper_work);
}
/*
* Note that we didn't update flags telling whether inode cares about
@@ -439,7 +439,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
spin_lock(&destroy_lock);
list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
+ queue_delayed_work(system_dfl_wq, &reaper_work,
FSNOTIFY_REAPER_DELAY);
}
EXPORT_SYMBOL_GPL(fsnotify_put_mark);
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 59aa801347a7..e7fd8a790aaa 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -13,12 +13,26 @@
#include <linux/nsfs.h>
#include <linux/uaccess.h>
#include <linux/mnt_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
+#include <linux/utsname.h>
+#include <linux/exportfs.h>
+#include <linux/nstree.h>
+#include <net/net_namespace.h>
#include "mount.h"
#include "internal.h"
static struct vfsmount *nsfs_mnt;
+static struct path nsfs_root_path = {};
+
+void nsfs_get_root(struct path *path)
+{
+ *path = nsfs_root_path;
+ path_get(path);
+}
+
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg);
static const struct file_operations ns_file_operations = {
@@ -139,7 +153,7 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
* the size value will be set to the size the kernel knows about.
*/
kinfo->size = min(usize, sizeof(*kinfo));
- kinfo->mnt_ns_id = mnt_ns->seq;
+ kinfo->mnt_ns_id = mnt_ns->ns.ns_id;
kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts);
/* Subtract the root mount of the mount namespace. */
if (kinfo->nr_mounts)
@@ -163,15 +177,18 @@ static bool nsfs_ioctl_valid(unsigned int cmd)
case NS_GET_TGID_FROM_PIDNS:
case NS_GET_PID_IN_PIDNS:
case NS_GET_TGID_IN_PIDNS:
- return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+ case NS_GET_ID:
+ return true;
}
/* Extensible ioctls require some extra handling. */
switch (_IOC_NR(cmd)) {
case _IOC_NR(NS_MNT_GET_INFO):
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0);
case _IOC_NR(NS_MNT_GET_NEXT):
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0);
case _IOC_NR(NS_MNT_GET_PREV):
- return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0);
}
return false;
@@ -202,26 +219,14 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
return -EINVAL;
return open_related_ns(ns, ns->ops->get_parent);
case NS_GET_NSTYPE:
- return ns->ops->type;
+ return ns->ns_type;
case NS_GET_OWNER_UID:
- if (ns->ops->type != CLONE_NEWUSER)
+ if (ns->ns_type != CLONE_NEWUSER)
return -EINVAL;
user_ns = container_of(ns, struct user_namespace, ns);
argp = (uid_t __user *) arg;
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
- case NS_GET_MNTNS_ID: {
- __u64 __user *idp;
- __u64 id;
-
- if (ns->ops->type != CLONE_NEWNS)
- return -EINVAL;
-
- mnt_ns = container_of(ns, struct mnt_namespace, ns);
- idp = (__u64 __user *)arg;
- id = mnt_ns->seq;
- return put_user(id, idp);
- }
case NS_GET_PID_FROM_PIDNS:
fallthrough;
case NS_GET_TGID_FROM_PIDNS:
@@ -229,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
case NS_GET_PID_IN_PIDNS:
fallthrough;
case NS_GET_TGID_IN_PIDNS: {
- if (ns->ops->type != CLONE_NEWPID)
+ if (ns->ns_type != CLONE_NEWPID)
return -EINVAL;
ret = -ESRCH;
@@ -267,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
ret = -ESRCH;
return ret;
}
+ case NS_GET_MNTNS_ID:
+ if (ns->ns_type != CLONE_NEWNS)
+ return -EINVAL;
+ fallthrough;
+ case NS_GET_ID: {
+ __u64 __user *idp;
+ __u64 id;
+
+ idp = (__u64 __user *)arg;
+ id = ns->ns_id;
+ return put_user(id, idp);
+ }
}
/* extensible ioctls */
@@ -276,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
size_t usize = _IOC_SIZE(ioctl);
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return -EINVAL;
if (!uinfo)
@@ -297,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct file *f __free(fput) = NULL;
size_t usize = _IOC_SIZE(ioctl);
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return -EINVAL;
if (usize < MNT_NS_INFO_SIZE_VER0)
@@ -415,12 +432,164 @@ static const struct stashed_operations nsfs_stashed_ops = {
.put_data = nsfs_put_data,
};
+#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32))
+#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32))
+
+static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh;
+ struct ns_common *ns = inode->i_private;
+ int len = *max_len;
+
+ if (parent)
+ return FILEID_INVALID;
+
+ if (len < NSFS_FID_SIZE_U32_VER0) {
+ *max_len = NSFS_FID_SIZE_U32_LATEST;
+ return FILEID_INVALID;
+ } else if (len > NSFS_FID_SIZE_U32_LATEST) {
+ *max_len = NSFS_FID_SIZE_U32_LATEST;
+ }
+
+ fid->ns_id = ns->ns_id;
+ fid->ns_type = ns->ns_type;
+ fid->ns_inum = inode->i_ino;
+ return FILEID_NSFS;
+}
+
+static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct path path __free(path_put) = {};
+ struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh;
+ struct user_namespace *owning_ns = NULL;
+ struct ns_common *ns;
+ int ret;
+
+ if (fh_len < NSFS_FID_SIZE_U32_VER0)
+ return NULL;
+
+ /* Check that any trailing bytes are zero. */
+ if ((fh_len > NSFS_FID_SIZE_U32_LATEST) &&
+ memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0,
+ fh_len - NSFS_FID_SIZE_U32_LATEST))
+ return NULL;
+
+ switch (fh_type) {
+ case FILEID_NSFS:
+ break;
+ default:
+ return NULL;
+ }
+
+ scoped_guard(rcu) {
+ ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
+ if (!ns)
+ return NULL;
+
+ VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
+ VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type);
+ VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum);
+
+ if (!__ns_ref_get(ns))
+ return NULL;
+ }
+
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ if (!current_in_namespace(to_cg_ns(ns)))
+ owning_ns = to_cg_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ if (!current_in_namespace(to_ipc_ns(ns)))
+ owning_ns = to_ipc_ns(ns)->user_ns;
+ break;
+#endif
+ case CLONE_NEWNS:
+ if (!current_in_namespace(to_mnt_ns(ns)))
+ owning_ns = to_mnt_ns(ns)->user_ns;
+ break;
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ if (!current_in_namespace(to_net_ns(ns)))
+ owning_ns = to_net_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ if (!current_in_namespace(to_pid_ns(ns))) {
+ owning_ns = to_pid_ns(ns)->user_ns;
+ } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) {
+ ns->ops->put(ns);
+ return ERR_PTR(-EPERM);
+ }
+ break;
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ if (!current_in_namespace(to_time_ns(ns)))
+ owning_ns = to_time_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ if (!current_in_namespace(to_user_ns(ns)))
+ owning_ns = to_user_ns(ns);
+ break;
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ if (!current_in_namespace(to_uts_ns(ns)))
+ owning_ns = to_uts_ns(ns)->user_ns;
+ break;
+#endif
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) {
+ ns->ops->put(ns);
+ return ERR_PTR(-EPERM);
+ }
+
+ /* path_from_stashed() unconditionally consumes the reference. */
+ ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return no_free_ptr(path.dentry);
+}
+
+static int nsfs_export_permission(struct handle_to_path_ctx *ctx,
+ unsigned int oflags)
+{
+ /* nsfs_fh_to_dentry() performs all permission checks. */
+ return 0;
+}
+
+static struct file *nsfs_export_open(struct path *path, unsigned int oflags)
+{
+ return file_open_root(path, "", oflags, 0);
+}
+
+static const struct export_operations nsfs_export_operations = {
+ .encode_fh = nsfs_encode_fh,
+ .fh_to_dentry = nsfs_fh_to_dentry,
+ .open = nsfs_export_open,
+ .permission = nsfs_export_permission,
+};
+
static int nsfs_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
if (!ctx)
return -ENOMEM;
ctx->ops = &nsfs_ops;
+ ctx->eops = &nsfs_export_operations;
ctx->dops = &ns_dentry_operations;
fc->s_fs_info = (void *)&nsfs_stashed_ops;
return 0;
@@ -438,4 +607,6 @@ void __init nsfs_init(void)
if (IS_ERR(nsfs_mnt))
panic("can't set nsfs up\n");
nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
+ nsfs_root_path.mnt = nsfs_mnt;
+ nsfs_root_path.dentry = nsfs_mnt->mnt_root;
}
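
The export operations above let namespace files be encoded into and decoded from file handles. A userspace sketch, assuming a kernel with this change, that encodes a handle for the caller's network namespace; decoding it again would go through open_by_handle_at() and the permission checks in nsfs_fh_to_dentry():

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct file_handle *fh;
	int mount_id;

	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	if (!fh)
		return 1;
	fh->handle_bytes = MAX_HANDLE_SZ;

	/* Ask the kernel to encode a handle for our network namespace. */
	if (name_to_handle_at(AT_FDCWD, "/proc/self/ns/net", fh,
			      &mount_id, 0) < 0) {
		perror("name_to_handle_at");	/* EOPNOTSUPP on older kernels */
		free(fh);
		return 1;
	}

	printf("handle type %d, %u bytes:", fh->handle_type, fh->handle_bytes);
	for (unsigned int i = 0; i < fh->handle_bytes; i++)
		printf(" %02x", fh->f_handle[i]);
	putchar('\n');
	free(fh);
	return 0;
}
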
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2018501b2249..2347a50f079b 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1876,7 +1876,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
dlm_debug_init(dlm);
snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
- dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0);
+ dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!dlm->dlm_worker) {
status = -ENOMEM;
mlog_errno(status);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 5130ec44e5e1..cccaa1d6fbba 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = {
.alloc_inode = dlmfs_alloc_inode,
.free_inode = dlmfs_free_inode,
.evict_inode = dlmfs_evict_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static const struct inode_operations dlmfs_file_inode_operations = {
@@ -595,7 +595,8 @@ static int __init init_dlmfs_fs(void)
}
cleanup_inode = 1;
- user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0);
+ user_dlm_worker = alloc_workqueue("user_dlm",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!user_dlm_worker) {
status = -ENOMEM;
goto bail;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 930150ed5db1..ef147e8b3271 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -706,6 +706,8 @@ out:
* it not only handles the fiemap for inlined files, but also deals
* with the fast symlink, cause they have no difference for extent
* mapping per se.
+ *
+ * Must be called with ip_alloc_sem semaphore held.
*/
static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
struct fiemap_extent_info *fieinfo,
@@ -717,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
u64 phys;
u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ lockdep_assert_held_read(&oi->ip_alloc_sem);
di = (struct ocfs2_dinode *)di_bh->b_data;
if (ocfs2_inode_is_fast_symlink(inode))
@@ -732,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
phys += offsetof(struct ocfs2_dinode,
id2.i_data.id_data);
+ /* Release the ip_alloc_sem to prevent deadlock on page fault */
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
flags);
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret < 0)
return ret;
}
@@ -802,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
-
+ /* Release the ip_alloc_sem to prevent deadlock on page fault */
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
len_bytes, fe_flags);
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret)
break;
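
Both fiemap paths now drop ip_alloc_sem around fiemap_fill_next_extent(), since that callback can fault on the user buffer and the fault path may need the same semaphore. A generic userspace sketch of the drop-and-retake pattern, with a pthread rwlock standing in for the semaphore and illustrative names throughout:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t alloc_sem = PTHREAD_RWLOCK_INITIALIZER;

/*
 * Stand-in for fiemap_fill_next_extent(): it may copy to a user buffer and
 * fault, and the fault path could need the same lock to map the page.
 */
static int fill_next_extent(unsigned long long virt, unsigned long long len)
{
	printf("extent: %llu..%llu\n", virt, virt + len);
	return 0;
}

static int walk_extents(void)
{
	int ret = 0;

	pthread_rwlock_rdlock(&alloc_sem);
	for (unsigned long long pos = 0; pos < 3 && !ret; pos++) {
		/*
		 * Drop the lock around the callback to avoid self-deadlock,
		 * then retake it before touching the extent tree again.
		 */
		pthread_rwlock_unlock(&alloc_sem);
		ret = fill_next_extent(pos * 4096, 4096);
		pthread_rwlock_rdlock(&alloc_sem);
	}
	pthread_rwlock_unlock(&alloc_sem);
	return ret;
}

int main(void)
{
	return walk_extents();
}
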
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 14bf440ea4df..6c4f78f473fb 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1281,6 +1281,9 @@ static void ocfs2_clear_inode(struct inode *inode)
* the journal is flushed before journal shutdown. Thus it is safe to
* have inodes get cleaned up after journal shutdown.
*/
+ if (!osb->journal)
+ return;
+
jbd2_journal_release_jbd_inode(osb->journal->j_journal,
&oi->ip_jinode);
}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 8f732742b26e..267b50e8e42e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4418,7 +4418,7 @@ int ocfs2_reflink_ioctl(struct inode *inode,
return error;
}
- new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
+ new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry)) {
mlog_errno(error);
@@ -4435,7 +4435,7 @@ int ocfs2_reflink_ioctl(struct inode *inode,
d_inode(new_path.dentry),
new_dentry, preserve);
out_dput:
- done_path_create(&new_path, new_dentry);
+ end_creating_path(&new_path, new_dentry);
out:
path_put(&old_path);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 0f045e45fa0c..765105f1ff8a 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -952,7 +952,7 @@ static const struct dlm_lockspace_ops ocfs2_ls_ops = {
static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
{
version_unlock(conn);
- dlm_release_lockspace(conn->cc_lockspace, 2);
+ dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL);
conn->cc_lockspace = NULL;
ocfs2_live_connection_drop(conn->cc_private);
conn->cc_private = NULL;
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index f3da840758e7..b46100a4f529 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = {
.free_inode = orangefs_free_inode,
.destroy_inode = orangefs_destroy_inode,
.write_inode = orangefs_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.statfs = orangefs_statfs,
.show_options = orangefs_show_options,
};
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index bb0d7ded8e76..4f84abaa0d68 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -361,10 +361,9 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir,
{
int err;
struct renamedata rd = {
- .old_mnt_idmap = ovl_upper_mnt_idmap(ofs),
+ .mnt_idmap = ovl_upper_mnt_idmap(ofs),
.old_parent = olddir,
.old_dentry = olddentry,
- .new_mnt_idmap = ovl_upper_mnt_idmap(ofs),
.new_parent = newdir,
.new_dentry = newdentry,
.flags = flags,
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index b65cdfce31ce..15cb06fa0c9a 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -270,26 +270,26 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
{
- int err;
+ int err = 0;
struct dentry *dentry, *dir = path->dentry;
const struct cred *old_cred;
old_cred = ovl_override_creds(rdd->dentry->d_sb);
- err = down_write_killable(&dir->d_inode->i_rwsem);
- if (!err) {
- while (rdd->first_maybe_whiteout) {
- struct ovl_cache_entry *p =
- rdd->first_maybe_whiteout;
- rdd->first_maybe_whiteout = p->next_maybe_whiteout;
- dentry = lookup_one(mnt_idmap(path->mnt),
- &QSTR_LEN(p->name, p->len), dir);
- if (!IS_ERR(dentry)) {
- p->is_whiteout = ovl_is_whiteout(dentry);
- dput(dentry);
- }
+ while (rdd->first_maybe_whiteout) {
+ struct ovl_cache_entry *p =
+ rdd->first_maybe_whiteout;
+ rdd->first_maybe_whiteout = p->next_maybe_whiteout;
+ dentry = lookup_one_positive_killable(mnt_idmap(path->mnt),
+ &QSTR_LEN(p->name, p->len),
+ dir);
+ if (!IS_ERR(dentry)) {
+ p->is_whiteout = ovl_is_whiteout(dentry);
+ dput(dentry);
+ } else if (PTR_ERR(dentry) == -EINTR) {
+ err = -EINTR;
+ break;
}
- inode_unlock(dir->d_inode);
}
ovl_revert_creds(old_cred);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index df85a76597e9..bd3d7ba8fb95 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -280,7 +280,7 @@ static const struct super_operations ovl_super_operations = {
.alloc_inode = ovl_alloc_inode,
.free_inode = ovl_free_inode,
.destroy_inode = ovl_destroy_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.put_super = ovl_put_super,
.sync_fs = ovl_sync_fs,
.statfs = ovl_statfs,
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 108e7527f837..c40c29c702e5 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd)
* erronously mistook the file descriptor for a pidfd.
* This is not perfect but will catch most cases.
*/
- return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO));
+ return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
}
return false;
@@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode)
}
static const struct super_operations pidfs_sops = {
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = pidfs_evict_inode,
.statfs = simple_statfs,
};
diff --git a/fs/pipe.c b/fs/pipe.c
index 731622d0738d..42fead1efe52 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
mutex_lock(&pipe->mutex);
if (!pipe->readers) {
- send_sig(SIGPIPE, current, 0);
+ if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
+ send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
}
@@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
for (;;) {
if (!pipe->readers) {
- send_sig(SIGPIPE, current, 0);
+ if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
+ send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d6a0369caa93..69269745d73b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -157,13 +157,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
unsigned int max_fds = 0;
rcu_read_lock();
- ppid = pid_alive(p) ?
- task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
-
tracer = ptrace_parent(p);
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
+ ppid = task_ppid_nr_ns(p, ns);
tgid = task_tgid_nr_ns(p, ns);
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 76e800e38c8f..176281112273 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -367,6 +367,25 @@ static const struct inode_operations proc_dir_inode_operations = {
.setattr = proc_notify_change,
};
+static void pde_set_flags(struct proc_dir_entry *pde)
+{
+ const struct proc_ops *proc_ops = pde->proc_ops;
+
+ if (!proc_ops)
+ return;
+
+ if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+ if (proc_ops->proc_read_iter)
+ pde->flags |= PROC_ENTRY_proc_read_iter;
+#ifdef CONFIG_COMPAT
+ if (proc_ops->proc_compat_ioctl)
+ pde->flags |= PROC_ENTRY_proc_compat_ioctl;
+#endif
+ if (proc_ops->proc_lseek)
+ pde->flags |= PROC_ENTRY_proc_lseek;
+}
+
/* returns the registered entry, or frees dp and returns NULL on failure */
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
struct proc_dir_entry *dp)
@@ -374,6 +393,9 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
if (proc_alloc_inum(&dp->low_ino))
goto out_free_entry;
+ if (!S_ISDIR(dp->mode))
+ pde_set_flags(dp);
+
write_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
@@ -561,20 +583,6 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
-static void pde_set_flags(struct proc_dir_entry *pde)
-{
- if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
- pde->flags |= PROC_ENTRY_PERMANENT;
- if (pde->proc_ops->proc_read_iter)
- pde->flags |= PROC_ENTRY_proc_read_iter;
-#ifdef CONFIG_COMPAT
- if (pde->proc_ops->proc_compat_ioctl)
- pde->flags |= PROC_ENTRY_proc_compat_ioctl;
-#endif
- if (pde->proc_ops->proc_lseek)
- pde->flags |= PROC_ENTRY_proc_lseek;
-}
-
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct proc_ops *proc_ops, void *data)
@@ -585,7 +593,6 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
if (!p)
return NULL;
p->proc_ops = proc_ops;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -636,7 +643,6 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
p->proc_ops = &proc_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_seq_private);
@@ -667,7 +673,6 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
return NULL;
p->proc_ops = &proc_single_ops;
p->single_show = show;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_single_data);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 129490151be1..d9b7ef122343 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.free_inode = proc_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
.show_options = proc_show_options,
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 4403a2e20c16..ea2b597fd92c 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -12,7 +12,7 @@
#include "internal.h"
-static const struct proc_ns_operations *ns_entries[] = {
+static const struct proc_ns_operations *const ns_entries[] = {
#ifdef CONFIG_NET_NS
&netns_operations,
#endif
@@ -117,7 +117,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry,
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
struct task_struct *task = get_proc_task(file_inode(file));
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
if (!task)
return -ENOENT;
@@ -151,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
struct task_struct *task = get_proc_task(dir);
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
unsigned int len = dentry->d_name.len;
struct dentry *res = ERR_PTR(-ENOENT);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ed86ac710384..1e24e085c7d5 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -38,12 +38,14 @@ enum proc_param {
Opt_gid,
Opt_hidepid,
Opt_subset,
+ Opt_pidns,
};
static const struct fs_parameter_spec proc_fs_parameters[] = {
- fsparam_u32("gid", Opt_gid),
+ fsparam_u32("gid", Opt_gid),
fsparam_string("hidepid", Opt_hidepid),
fsparam_string("subset", Opt_subset),
+ fsparam_file_or_string("pidns", Opt_pidns),
{}
};
@@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
return 0;
}
+#ifdef CONFIG_PID_NS
+static int proc_parse_pidns_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct pid_namespace *target, *active = task_active_pid_ns(current);
+ struct ns_common *ns;
+ struct file *ns_filp __free(fput) = NULL;
+
+ switch (param->type) {
+ case fs_value_is_file:
+ /* came through fsconfig, steal the file reference */
+ ns_filp = no_free_ptr(param->file);
+ break;
+ case fs_value_is_string:
+ ns_filp = filp_open(param->string, O_RDONLY, 0);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ break;
+ }
+ if (!ns_filp)
+ ns_filp = ERR_PTR(-EBADF);
+ if (IS_ERR(ns_filp)) {
+ errorfc(fc, "could not get file from pidns argument");
+ return PTR_ERR(ns_filp);
+ }
+
+ if (!proc_ns_file(ns_filp))
+ return invalfc(fc, "pidns argument is not an nsfs file");
+ ns = get_proc_ns(file_inode(ns_filp));
+ if (ns->ns_type != CLONE_NEWPID)
+ return invalfc(fc, "pidns argument is not a pidns file");
+ target = container_of(ns, struct pid_namespace, ns);
+
+ /*
+ * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
+ * permission model should be the same as pidns_install().
+ */
+ if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
+ errorfc(fc, "insufficient permissions to set pidns");
+ return -EPERM;
+ }
+ if (!pidns_is_ancestor(target, active))
+ return invalfc(fc, "cannot set pidns to non-descendant pidns");
+
+ put_pid_ns(ctx->pid_ns);
+ ctx->pid_ns = get_pid_ns(target);
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ return 0;
+}
+#endif /* CONFIG_PID_NS */
+
static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct proc_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ int opt, err;
opt = fs_parse(fc, proc_fs_parameters, param, &result);
if (opt < 0)
@@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_hidepid:
- if (proc_parse_hidepid_param(fc, param))
- return -EINVAL;
+ err = proc_parse_hidepid_param(fc, param);
+ if (err)
+ return err;
break;
case Opt_subset:
- if (proc_parse_subset_param(fc, param->string) < 0)
- return -EINVAL;
+ err = proc_parse_subset_param(fc, param->string);
+ if (err)
+ return err;
+ break;
+
+ case Opt_pidns:
+#ifdef CONFIG_PID_NS
+ /*
+ * We would have to RCU-protect every proc_pid_ns() or
+ * proc_sb_info() access if we allowed this to be reconfigured
+ * for an existing procfs instance. Luckily, procfs instances
+ * are cheap to create, and mount-beneath would let you
+ * atomically replace an instance even with overmounts.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ errorfc(fc, "cannot reconfigure pidns for existing procfs");
+ return -EBUSY;
+ }
+ err = proc_parse_pidns_param(fc, param, &result);
+ if (err)
+ return err;
break;
+#else
+ errorfc(fc, "pidns mount flag not supported on this system");
+ return -EOPNOTSUPP;
+#endif
default:
return -EINVAL;
@@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
fs_info->hide_pid = ctx->hidepid;
if (ctx->mask & (1 << Opt_subset))
fs_info->pidonly = ctx->pidonly;
+ if (ctx->mask & (1 << Opt_pidns) &&
+ !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
+ put_pid_ns(fs_info->pid_ns);
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ }
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
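
The new "pidns" parameter pins a fresh procfs instance to a descendant pid namespace and is refused on reconfigure. A hedged userspace sketch using the new mount API; it assumes a kernel with this change, CAP_SYS_ADMIN over the target namespace, and headers that provide the FSCONFIG_* constants and fs* syscall numbers:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fsfd, mntfd;

	fsfd = syscall(SYS_fsopen, "proc", 0);
	if (fsfd < 0) {
		perror("fsopen");
		return 1;
	}

	/*
	 * Pin this procfs instance to a pid namespace given by path; an nsfs
	 * file descriptor could be passed with FSCONFIG_SET_FD instead.
	 */
	if (syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "pidns",
		    "/proc/self/ns/pid", 0) < 0) {
		perror("fsconfig pidns");
		return 1;
	}
	if (syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
		perror("fsconfig create");
		return 1;
	}

	mntfd = syscall(SYS_fsmount, fsfd, 0, 0);
	if (mntfd < 0) {
		perror("fsmount");
		return 1;
	}
	printf("detached procfs mount fd: %d\n", mntfd);
	close(mntfd);
	close(fsfd);
	return 0;
}

Passing a path string goes through filp_open() in proc_parse_pidns_param(), so the same nsfs and ancestry validation applies as for a file descriptor supplied with FSCONFIG_SET_FD.
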
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 29cca0e6d0ff..b26ae556b446 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2417,6 +2417,9 @@ static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
{
struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+ if (!p->vec_buf)
+ return;
+
if (cur_buf->start != addr)
cur_buf->end = addr;
else
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 1a2e1185426c..b4e55c90f8dc 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc)
static const struct super_operations pstore_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = pstore_evict_inode,
.show_options = pstore_show_options,
};
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index ceb5639a0629..eb61ba5bb964 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -43,7 +43,7 @@ struct psz_buffer {
*
* @magic: magic num for kmsg dump header
* @time: kmsg dump trigger time
- * @compressed: whether conpressed
+ * @compressed: whether compressed
* @counter: kmsg dump counter
* @reason: the kmsg dump reason (e.g. oops, panic, etc)
* @data: pointer to log data
@@ -214,7 +214,7 @@ static int psz_zone_write(struct pstore_zone *zone,
atomic_set(&zone->buffer->datalen, wlen + off);
}
- /* avoid to damage old records */
+ /* avoid damaging old records */
if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered))
goto dirty;
@@ -249,7 +249,7 @@ static int psz_zone_write(struct pstore_zone *zone,
return 0;
dirty:
- /* no need to mark dirty if going to try next zone */
+ /* no need to mark it dirty if going to try next zone */
if (wcnt == -ENOMSG)
return -ENOMSG;
atomic_set(&zone->dirty, true);
@@ -378,7 +378,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt)
struct timespec64 time = { };
unsigned long i;
/*
- * Recover may on panic, we can't allocate any memory by kmalloc.
+ * Recovery may happen on panic, so we can't allocate any memory by kmalloc.
* So, we use local array instead.
*/
char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0};
@@ -856,11 +856,11 @@ static int notrace psz_record_write(struct pstore_zone *zone,
/**
* psz_zone_write will set datalen as start + cnt.
- * It work if actual data length lesser than buffer size.
- * If data length greater than buffer size, pmsg will rewrite to
- * beginning of zone, which make buffer->datalen wrongly.
+ * It works if actual data length is less than buffer size.
+ * If data length is greater than buffer size, pmsg will rewrite to
+ * the beginning of the zone, which makes buffer->datalen wrong.
* So we should reset datalen as buffer size once actual data length
- * greater than buffer size.
+ * is greater than buffer size.
*/
if (is_full_data) {
atomic_set(&zone->buffer->datalen, zone->buffer_size);
@@ -878,8 +878,9 @@ static int notrace psz_pstore_write(struct pstore_record *record)
atomic_set(&cxt->on_panic, 1);
/*
- * if on panic, do not write except panic records
- * Fix case that panic_write prints log which wakes up console backend.
+ * If on panic, do not write anything except panic records.
+ * Fix the case when panic_write prints log that wakes up
+ * console backend.
*/
if (is_on_panic() && record->type != PSTORE_TYPE_DMESG)
return -EBUSY;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index df4a9b348769..afa15a214538 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -881,7 +881,7 @@ void dqput(struct dquot *dquot)
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
+ queue_delayed_work(system_dfl_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index f8874c3b8c1e..41f9995da7ca 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root)
static const struct super_operations ramfs_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.show_options = ramfs_show_options,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index c5b6265d984b..833bae068770 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1576,6 +1576,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (len == 0)
return 0;
+ /*
+ * Make sure return value doesn't overflow in 32bit compat mode. Also
+ * limit the size for all cases except when calling ->copy_file_range().
+ */
+ if (splice || !file_out->f_op->copy_file_range || in_compat_syscall())
+ len = min_t(size_t, MAX_RW_COUNT, len);
+
file_start_write(file_out);
/*
@@ -1589,9 +1596,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
len, flags);
} else if (!splice && file_in->f_op->remap_file_range && samesb) {
ret = file_in->f_op->remap_file_range(file_in, pos_in,
- file_out, pos_out,
- min_t(loff_t, MAX_RW_COUNT, len),
- REMAP_FILE_CAN_SHORTEN);
+ file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN);
/* fallback to splice */
if (ret <= 0)
splice = true;
@@ -1624,8 +1629,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* to splicing from input file, while file_start_write() is held on
* the output file on a different sb.
*/
- ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
- min_t(size_t, len, MAX_RW_COUNT), 0);
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0);
done:
if (ret > 0) {
fsnotify_access(file_in);
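
The added clamp keeps the copied length within MAX_RW_COUNT so the ssize_t return value cannot overflow for 32-bit callers. A standalone illustration of the overflow and the clamp; MAX_RW_COUNT is reconstructed here from the kernel's INT_MAX & PAGE_MASK definition:

#include <stdint.h>
#include <stdio.h>

/* Kernel definition: MAX_RW_COUNT = INT_MAX & PAGE_MASK. */
#define PAGE_MASK	(~4095UL)
#define MAX_RW_COUNT	(2147483647UL & PAGE_MASK)

int main(void)
{
	uint64_t requested = 3ULL << 30;	/* a 3 GiB copy request */
	uint64_t clamped = requested < MAX_RW_COUNT ? requested : MAX_RW_COUNT;
	long compat_ret = (int32_t)requested;	/* what a 32-bit ssize_t would hold */

	printf("requested      : %llu\n", (unsigned long long)requested);
	printf("as 32-bit ret  : %ld (overflows into a bogus value)\n", compat_ret);
	printf("clamped length : %llu\n", (unsigned long long)clamped);
	return 0;
}
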
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index d98e0d2de09f..0d0ef54fc4de 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -473,12 +473,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
rdt_last_cmd_clear();
if (!strcmp(buf, "mbm_local_bytes")) {
- if (resctrl_arch_is_mbm_local_enabled())
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
else
ret = -EINVAL;
} else if (!strcmp(buf, "mbm_total_bytes")) {
- if (resctrl_arch_is_mbm_total_enabled())
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
else
ret = -EINVAL;
@@ -563,10 +563,15 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
rr->r = r;
rr->d = d;
rr->first = first;
- rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
- if (IS_ERR(rr->arch_mon_ctx)) {
- rr->err = -EINVAL;
- return;
+ if (resctrl_arch_mbm_cntr_assign_enabled(r) &&
+ resctrl_is_mbm_event(evtid)) {
+ rr->is_mbm_cntr = true;
+ } else {
+ rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
+ if (IS_ERR(rr->arch_mon_ctx)) {
+ rr->err = -EINVAL;
+ return;
+ }
}
cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
@@ -582,7 +587,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
else
smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
- resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
+ if (rr->arch_mon_ctx)
+ resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
}
int rdtgroup_mondata_show(struct seq_file *m, void *arg)
@@ -625,11 +631,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
*/
list_for_each_entry(d, &r->mon_domains, hdr.list) {
if (d->ci_id == domid) {
- rr.ci_id = d->ci_id;
cpu = cpumask_any(&d->hdr.cpu_mask);
ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
if (!ci)
continue;
+ rr.ci = ci;
mon_event_read(&rr, r, NULL, rdtgrp,
&ci->shared_cpu_map, evtid, false);
goto checkresult;
@@ -653,10 +659,16 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
checkresult:
+ /*
+ * -ENOENT is a special case, set only when "mbm_event" counter assignment
+ * mode is enabled and no counter has been assigned.
+ */
if (rr.err == -EIO)
seq_puts(m, "Error\n");
else if (rr.err == -EINVAL)
seq_puts(m, "Unavailable\n");
+ else if (rr.err == -ENOENT)
+ seq_puts(m, "Unassigned\n");
else
seq_printf(m, "%llu\n", rr.val);
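
The mon_event_read() change above rests on one new decision: when the "mbm_event" counter assignment mode is active and the event being read is an MBM event, the value comes from a pre-assigned hardware counter and no arch monitor context is allocated; a read that finds no assigned counter reports -ENOENT, which rdtgroup_mondata_show() renders as "Unassigned". A small illustrative wrapper for that check (the wrapper name is invented; the two predicates are the ones used in the hunks above):

static bool use_assigned_counter(struct rdt_resource *r, enum resctrl_event_id evtid)
{
	/* Assigned-counter reads apply only to MBM events in "mbm_event" mode. */
	return resctrl_arch_mbm_cntr_assign_enabled(r) && resctrl_is_mbm_event(evtid);
}
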
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 0a1eedba2b03..cf1fd82dc5a9 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -52,19 +52,31 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
}
/**
- * struct mon_evt - Entry in the event list of a resource
+ * struct mon_evt - Properties of a monitor event
* @evtid: event id
+ * @rid: resource id for this event
* @name: name of the event
+ * @evt_cfg: Event configuration value that represents the
+ * memory transactions (e.g., READS_TO_LOCAL_MEM,
+ * READS_TO_REMOTE_MEM) being tracked by @evtid.
+ * Only valid if @evtid is an MBM event.
* @configurable: true if the event is configurable
- * @list: entry in &rdt_resource->evt_list
+ * @enabled: true if the event is enabled
*/
struct mon_evt {
enum resctrl_event_id evtid;
+ enum resctrl_res_level rid;
char *name;
+ u32 evt_cfg;
bool configurable;
- struct list_head list;
+ bool enabled;
};
+extern struct mon_evt mon_event_all[QOS_NUM_EVENTS];
+
+#define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \
+ mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++)
+
/**
* struct mon_data - Monitoring details for each event file.
* @list: Member of the global @mon_data_kn_priv_list list.
@@ -98,7 +110,9 @@ struct mon_data {
* domains in @r sharing L3 @ci.id
* @evtid: Which monitor event to read.
* @first: Initialize MBM counter when true.
- * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
+ * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
+ * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and
+ * @evtid is an MBM event.
* @err: Error encountered when reading counter.
* @val: Returned value of event counter. If @rgrp is a parent resource group,
* @val includes the sum of event counts from its child resource groups.
@@ -112,7 +126,8 @@ struct rmid_read {
struct rdt_mon_domain *d;
enum resctrl_event_id evtid;
bool first;
- unsigned int ci_id;
+ struct cacheinfo *ci;
+ bool is_mbm_cntr;
int err;
u64 val;
void *arch_mon_ctx;
@@ -226,6 +241,8 @@ struct rdtgroup {
#define RFTYPE_DEBUG BIT(10)
+#define RFTYPE_ASSIGN_CONFIG BIT(11)
+
#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
@@ -375,6 +392,41 @@ bool closid_allocated(unsigned int closid);
int resctrl_find_cleanest_closid(void);
+void *rdt_kn_parent_priv(struct kernfs_node *kn);
+
+int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v);
+
+ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off);
+
+void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn,
+ bool show);
+
+int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v);
+
+int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s,
+ void *v);
+
+void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp);
+
+void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp);
+
+int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v);
+
+ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off);
+
+int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+
+ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off);
+
+int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v);
+
+ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off);
+
#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
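
The internal.h changes replace the per-resource event list with a global mon_event_all[] table indexed by event id, plus the for_each_mon_event() iterator; callers now filter on an event's resource id and enabled flag instead of walking rdt_resource->evt_list. A sketch of the resulting iteration pattern (the helper itself is illustrative; rdt_mon_features_show() in the rdtgroup.c hunks below does essentially this):

static void show_enabled_events(struct rdt_resource *r, struct seq_file *seq)
{
	struct mon_evt *mevt;

	for_each_mon_event(mevt) {
		/* Skip events that belong to another resource or are not enabled. */
		if (mevt->rid != r->rid || !mevt->enabled)
			continue;
		seq_printf(seq, "%s\n", mevt->name);
	}
}
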
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index f5637855c3ac..4076336fbba6 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -336,7 +336,7 @@ void free_rmid(u32 closid, u32 rmid)
entry = __rmid_entry(idx);
- if (resctrl_arch_is_llc_occupancy_enabled())
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID))
add_rmid_to_limbo(entry);
else
list_add_tail(&entry->list, &rmid_free_lru);
@@ -346,28 +346,97 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
u32 rmid, enum resctrl_event_id evtid)
{
u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
+ struct mbm_state *state;
- switch (evtid) {
- case QOS_L3_MBM_TOTAL_EVENT_ID:
- return &d->mbm_total[idx];
- case QOS_L3_MBM_LOCAL_EVENT_ID:
- return &d->mbm_local[idx];
- default:
+ if (!resctrl_is_mbm_event(evtid))
return NULL;
+
+ state = d->mbm_states[MBM_STATE_IDX(evtid)];
+
+ return state ? &state[idx] : NULL;
+}
+
+/*
+ * mbm_cntr_get() - Return the counter ID for the matching @evtid and @rdtgrp.
+ *
+ * Return:
+ * Valid counter ID on success, or -ENOENT on failure.
+ */
+static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d,
+ struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
+{
+ int cntr_id;
+
+ if (!r->mon.mbm_cntr_assignable)
+ return -ENOENT;
+
+ if (!resctrl_is_mbm_event(evtid))
+ return -ENOENT;
+
+ for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) {
+ if (d->cntr_cfg[cntr_id].rdtgrp == rdtgrp &&
+ d->cntr_cfg[cntr_id].evtid == evtid)
+ return cntr_id;
+ }
+
+ return -ENOENT;
+}
+
+/*
+ * mbm_cntr_alloc() - Initialize and return a new counter ID in the domain @d.
+ * Caller must ensure that the specified event is not assigned already.
+ *
+ * Return:
+ * Valid counter ID on success, or -ENOSPC on failure.
+ */
+static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d,
+ struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
+{
+ int cntr_id;
+
+ for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) {
+ if (!d->cntr_cfg[cntr_id].rdtgrp) {
+ d->cntr_cfg[cntr_id].rdtgrp = rdtgrp;
+ d->cntr_cfg[cntr_id].evtid = evtid;
+ return cntr_id;
+ }
}
+
+ return -ENOSPC;
}
-static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
+/*
+ * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d.
+ */
+static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id)
+{
+ memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg));
+}
+
+static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr)
{
int cpu = smp_processor_id();
+ u32 closid = rdtgrp->closid;
+ u32 rmid = rdtgrp->mon.rmid;
struct rdt_mon_domain *d;
- struct cacheinfo *ci;
+ int cntr_id = -ENOENT;
struct mbm_state *m;
int err, ret;
u64 tval = 0;
+ if (rr->is_mbm_cntr) {
+ cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid);
+ if (cntr_id < 0) {
+ rr->err = -ENOENT;
+ return -EINVAL;
+ }
+ }
+
if (rr->first) {
- resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
+ if (rr->is_mbm_cntr)
+ resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid);
+ else
+ resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
if (m)
memset(m, 0, sizeof(struct mbm_state));
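
The helpers added above form the lifecycle of the per-domain counter table: mbm_cntr_get() looks up an existing group/event assignment and returns -ENOENT if there is none, mbm_cntr_alloc() claims the first free slot and returns -ENOSPC when all of the domain's num_mbm_cntrs slots are in use, and mbm_cntr_free() clears a slot again. A condensed sketch of the get-then-alloc pattern that rdtgroup_alloc_assign_cntr() applies later in this patch (the wrapper name is invented):

static int get_or_alloc_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
			     struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
{
	int cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid);	/* reuse an existing assignment */

	if (cntr_id >= 0)
		return cntr_id;

	return mbm_cntr_alloc(r, d, rdtgrp, evtid);		/* -ENOSPC if the domain is full */
}
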
@@ -378,8 +447,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
/* Reading a single domain, must be on a CPU in that domain. */
if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
return -EINVAL;
- rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
- rr->evtid, &tval, rr->arch_mon_ctx);
+ if (rr->is_mbm_cntr)
+ rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id,
+ rr->evtid, &tval);
+ else
+ rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
if (rr->err)
return rr->err;
@@ -389,8 +462,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
}
/* Summing domains that share a cache, must be on a CPU for that cache. */
- ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
- if (!ci || ci->id != rr->ci_id)
+ if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
return -EINVAL;
/*
@@ -402,10 +474,14 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
*/
ret = -EINVAL;
list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
- if (d->ci_id != rr->ci_id)
+ if (d->ci_id != rr->ci->id)
continue;
- err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
- rr->evtid, &tval, rr->arch_mon_ctx);
+ if (rr->is_mbm_cntr)
+ err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id,
+ rr->evtid, &tval);
+ else
+ err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
if (!err) {
rr->val += tval;
ret = 0;
@@ -421,8 +497,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
/*
* mbm_bw_count() - Update bw count from values previously read by
* __mon_event_count().
- * @closid: The closid used to identify the cached mbm_state.
- * @rmid: The rmid used to identify the cached mbm_state.
+ * @rdtgrp: resctrl group associated with the CLOSID and RMID to identify
+ * the cached mbm_state.
* @rr: The struct rmid_read populated by __mon_event_count().
*
* Supporting function to calculate the memory bandwidth
@@ -430,9 +506,11 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
* __mon_event_count() is compared with the chunks value from the previous
* invocation. This must be called once per second to maintain values in MBps.
*/
-static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
+static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr)
{
u64 cur_bw, bytes, cur_bytes;
+ u32 closid = rdtgrp->closid;
+ u32 rmid = rdtgrp->mon.rmid;
struct mbm_state *m;
m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
@@ -461,7 +539,7 @@ void mon_event_count(void *info)
rdtgrp = rr->rgrp;
- ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
+ ret = __mon_event_count(rdtgrp, rr);
/*
* For Ctrl groups read data from child monitor groups and
@@ -472,8 +550,7 @@ void mon_event_count(void *info)
if (rdtgrp->type == RDTCTRL_GROUP) {
list_for_each_entry(entry, head, mon.crdtgrp_list) {
- if (__mon_event_count(entry->closid, entry->mon.rmid,
- rr) == 0)
+ if (__mon_event_count(entry, rr) == 0)
ret = 0;
}
}
@@ -604,44 +681,49 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
}
static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
- u32 closid, u32 rmid, enum resctrl_event_id evtid)
+ struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
{
struct rmid_read rr = {0};
rr.r = r;
rr.d = d;
rr.evtid = evtid;
- rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
- if (IS_ERR(rr.arch_mon_ctx)) {
- pr_warn_ratelimited("Failed to allocate monitor context: %ld",
- PTR_ERR(rr.arch_mon_ctx));
- return;
+ if (resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rr.is_mbm_cntr = true;
+ } else {
+ rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
+ if (IS_ERR(rr.arch_mon_ctx)) {
+ pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+ PTR_ERR(rr.arch_mon_ctx));
+ return;
+ }
}
- __mon_event_count(closid, rmid, &rr);
+ __mon_event_count(rdtgrp, &rr);
/*
* If the software controller is enabled, compute the
* bandwidth for this event id.
*/
if (is_mba_sc(NULL))
- mbm_bw_count(closid, rmid, &rr);
+ mbm_bw_count(rdtgrp, &rr);
- resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
+ if (rr.arch_mon_ctx)
+ resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}
static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
- u32 closid, u32 rmid)
+ struct rdtgroup *rdtgrp)
{
/*
* This is protected from concurrent reads from user as both
* the user and overflow handler hold the global mutex.
*/
- if (resctrl_arch_is_mbm_total_enabled())
- mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_TOTAL_EVENT_ID);
- if (resctrl_arch_is_mbm_local_enabled())
- mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ mbm_update_one_event(r, d, rdtgrp, QOS_L3_MBM_LOCAL_EVENT_ID);
}
/*
@@ -714,11 +796,11 @@ void mbm_handle_overflow(struct work_struct *work)
d = container_of(work, struct rdt_mon_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
+ mbm_update(r, d, prgrp);
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list)
- mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
+ mbm_update(r, d, crgrp);
if (is_mba_sc(NULL))
update_mba_bw(prgrp, d);
@@ -844,38 +926,819 @@ out_unlock:
mutex_unlock(&rdtgroup_mutex);
}
-static struct mon_evt llc_occupancy_event = {
- .name = "llc_occupancy",
- .evtid = QOS_L3_OCCUP_EVENT_ID,
+/*
+ * All available events. Architecture code marks the ones that
+ * are supported by a system using resctrl_enable_mon_event()
+ * to set .enabled.
+ */
+struct mon_evt mon_event_all[QOS_NUM_EVENTS] = {
+ [QOS_L3_OCCUP_EVENT_ID] = {
+ .name = "llc_occupancy",
+ .evtid = QOS_L3_OCCUP_EVENT_ID,
+ .rid = RDT_RESOURCE_L3,
+ },
+ [QOS_L3_MBM_TOTAL_EVENT_ID] = {
+ .name = "mbm_total_bytes",
+ .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
+ .rid = RDT_RESOURCE_L3,
+ },
+ [QOS_L3_MBM_LOCAL_EVENT_ID] = {
+ .name = "mbm_local_bytes",
+ .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
+ .rid = RDT_RESOURCE_L3,
+ },
};
-static struct mon_evt mbm_total_event = {
- .name = "mbm_total_bytes",
- .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
+void resctrl_enable_mon_event(enum resctrl_event_id eventid)
+{
+ if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS))
+ return;
+ if (mon_event_all[eventid].enabled) {
+ pr_warn("Duplicate enable for event %d\n", eventid);
+ return;
+ }
+
+ mon_event_all[eventid].enabled = true;
+}
+
+bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid)
+{
+ return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS &&
+ mon_event_all[eventid].enabled;
+}
+
+u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid)
+{
+ return mon_event_all[evtid].evt_cfg;
+}
+
+/**
+ * struct mbm_transaction - Memory transaction an MBM event can be configured with.
+ * @name: Name of memory transaction (read, write ...).
+ * @val: The bit (e.g. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to
+ * represent the memory transaction within an event's configuration.
+ */
+struct mbm_transaction {
+ char name[32];
+ u32 val;
};
-static struct mon_evt mbm_local_event = {
- .name = "mbm_local_bytes",
- .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
+/* Decoded values for each type of memory transaction. */
+static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = {
+ {"local_reads", READS_TO_LOCAL_MEM},
+ {"remote_reads", READS_TO_REMOTE_MEM},
+ {"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM},
+ {"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM},
+ {"local_reads_slow_memory", READS_TO_LOCAL_S_MEM},
+ {"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM},
+ {"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM},
};
+int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v)
+{
+ struct mon_evt *mevt = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r;
+ bool sep = false;
+ int ret = 0, i;
+
+ mutex_lock(&rdtgroup_mutex);
+ rdt_last_cmd_clear();
+
+ r = resctrl_arch_get_resource(mevt->rid);
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) {
+ if (mevt->evt_cfg & mbm_transactions[i].val) {
+ if (sep)
+ seq_putc(seq, ',');
+ seq_printf(seq, "%s", mbm_transactions[i].name);
+ sep = true;
+ }
+ }
+ seq_putc(seq, '\n');
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+
+int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s,
+ void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ int ret = 0;
+
+ mutex_lock(&rdtgroup_mutex);
+ rdt_last_cmd_clear();
+
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+
+ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ bool value;
+ int ret;
+
+ ret = kstrtobool(buf, &value);
+ if (ret)
+ return ret;
+
+ mutex_lock(&rdtgroup_mutex);
+ rdt_last_cmd_clear();
+
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ r->mon.mbm_assign_on_mkdir = value;
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret ?: nbytes;
+}
+
+/*
+ * mbm_cntr_free_all() - Clear all the counter ID configuration details in the
+ * domain @d. Called when mbm_assign_mode is changed.
+ */
+static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs);
+}
+
+/*
+ * resctrl_reset_rmid_all() - Reset all non-architecture states for all the
+ * supported RMIDs.
+ */
+static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+ enum resctrl_event_id evt;
+ int idx;
+
+ for_each_mbm_event_id(evt) {
+ if (!resctrl_is_mon_event_enabled(evt))
+ continue;
+ idx = MBM_STATE_IDX(evt);
+ memset(d->mbm_states[idx], 0, sizeof(*d->mbm_states[0]) * idx_limit);
+ }
+}
+
+/*
+ * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID
+ * pair in the domain.
+ *
+ * Assign the counter if @assign is true, otherwise unassign it. Reset the
+ * associated non-architectural state.
+ */
+static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ enum resctrl_event_id evtid, u32 rmid, u32 closid,
+ u32 cntr_id, bool assign)
+{
+ struct mbm_state *m;
+
+ resctrl_arch_config_cntr(r, d, evtid, rmid, closid, cntr_id, assign);
+
+ m = get_mbm_state(d, closid, rmid, evtid);
+ if (m)
+ memset(m, 0, sizeof(*m));
+}
+
+/*
+ * rdtgroup_alloc_assign_cntr() - Allocate a counter ID and assign it to the event
+ * pointed to by @mevt and the resctrl group @rdtgrp within the domain @d.
+ *
+ * Return:
+ * 0 on success, < 0 on failure.
+ */
+static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ struct rdtgroup *rdtgrp, struct mon_evt *mevt)
+{
+ int cntr_id;
+
+ /* No action required if the counter is assigned already. */
+ cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid);
+ if (cntr_id >= 0)
+ return 0;
+
+ cntr_id = mbm_cntr_alloc(r, d, rdtgrp, mevt->evtid);
+ if (cntr_id < 0) {
+ rdt_last_cmd_printf("Failed to allocate counter for %s in domain %d\n",
+ mevt->name, d->hdr.id);
+ return cntr_id;
+ }
+
+ rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, true);
+
+ return 0;
+}
+
/*
- * Initialize the event list for the resource.
+ * rdtgroup_assign_cntr_event() - Assign a hardware counter for the event in
+ * @mevt to the resctrl group @rdtgrp. Assign counters to all domains if @d is
+ * NULL; otherwise, assign the counter to the specified domain @d.
+ *
+ * If all counters in a domain are already in use, rdtgroup_alloc_assign_cntr()
+ * will fail. The assignment process will abort at the first failure encountered
+ * during domain traversal, which may result in the event being only partially
+ * assigned.
*
- * Note that MBM events are also part of RDT_RESOURCE_L3 resource
- * because as per the SDM the total and local memory bandwidth
- * are enumerated as part of L3 monitoring.
+ * Return:
+ * 0 on success, < 0 on failure.
+ */
+static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ struct mon_evt *mevt)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid);
+ int ret = 0;
+
+ if (!d) {
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt);
+ if (ret)
+ return ret;
+ }
+ } else {
+ ret = rdtgroup_alloc_assign_cntr(r, d, rdtgrp, mevt);
+ }
+
+ return ret;
+}
+
+/*
+ * rdtgroup_assign_cntrs() - Assign counters to MBM events. Called when
+ * a new group is created.
+ *
+ * Each group can accommodate two counters per domain: one for the total
+ * event and one for the local event. Assignments may fail due to the limited
+ * number of counters. However, it is not necessary to fail the group creation
+ * and thus no failure is returned. Users have the option to modify the
+ * counter assignments after the group has been created.
+ */
+void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+
+ if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r) ||
+ !r->mon.mbm_assign_on_mkdir)
+ return;
+
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ rdtgroup_assign_cntr_event(NULL, rdtgrp,
+ &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]);
+
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ rdtgroup_assign_cntr_event(NULL, rdtgrp,
+ &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]);
+}
+
+/*
+ * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration
+ * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp.
+ */
+static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ struct rdtgroup *rdtgrp, struct mon_evt *mevt)
+{
+ int cntr_id;
+
+ cntr_id = mbm_cntr_get(r, d, rdtgrp, mevt->evtid);
+
+ /* If there is no cntr_id assigned, nothing to do */
+ if (cntr_id < 0)
+ return;
+
+ rdtgroup_assign_cntr(r, d, mevt->evtid, rdtgrp->mon.rmid, rdtgrp->closid, cntr_id, false);
+
+ mbm_cntr_free(d, cntr_id);
+}
+
+/*
+ * rdtgroup_unassign_cntr_event() - Unassign a hardware counter associated with
+ * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign
+ * the counters from all the domains if @d is NULL, otherwise only from @d.
+ */
+static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ struct mon_evt *mevt)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid);
+
+ if (!d) {
+ list_for_each_entry(d, &r->mon_domains, hdr.list)
+ rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt);
+ } else {
+ rdtgroup_free_unassign_cntr(r, d, rdtgrp, mevt);
+ }
+}
+
+/*
+ * rdtgroup_unassign_cntrs() - Unassign the counters associated with MBM events.
+ * Called when a group is deleted.
*/
-static void l3_mon_evt_init(struct rdt_resource *r)
+void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp)
{
- INIT_LIST_HEAD(&r->evt_list);
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
- if (resctrl_arch_is_llc_occupancy_enabled())
- list_add_tail(&llc_occupancy_event.list, &r->evt_list);
- if (resctrl_arch_is_mbm_total_enabled())
- list_add_tail(&mbm_total_event.list, &r->evt_list);
- if (resctrl_arch_is_mbm_local_enabled())
- list_add_tail(&mbm_local_event.list, &r->evt_list);
+ if (!r->mon_capable || !resctrl_arch_mbm_cntr_assign_enabled(r))
+ return;
+
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ rdtgroup_unassign_cntr_event(NULL, rdtgrp,
+ &mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID]);
+
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ rdtgroup_unassign_cntr_event(NULL, rdtgrp,
+ &mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID]);
+}
+
+static int resctrl_parse_mem_transactions(char *tok, u32 *val)
+{
+ u32 temp_val = 0;
+ char *evt_str;
+ bool found;
+ int i;
+
+next_config:
+ if (!tok || tok[0] == '\0') {
+ *val = temp_val;
+ return 0;
+ }
+
+ /* Start processing the strings for each memory transaction type */
+ evt_str = strim(strsep(&tok, ","));
+ found = false;
+ for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) {
+ if (!strcmp(mbm_transactions[i].name, evt_str)) {
+ temp_val |= mbm_transactions[i].val;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ rdt_last_cmd_printf("Invalid memory transaction type %s\n", evt_str);
+ return -EINVAL;
+ }
+
+ goto next_config;
+}
+
+/*
+ * rdtgroup_update_cntr_event - Update the counter assignments for the event
+ * in a group.
+ * @r: Resource whose counter assignments are updated.
+ * @rdtgrp: Resctrl group.
+ * @evtid: MBM monitor event.
+ */
+static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp,
+ enum resctrl_event_id evtid)
+{
+ struct rdt_mon_domain *d;
+ int cntr_id;
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ cntr_id = mbm_cntr_get(r, d, rdtgrp, evtid);
+ if (cntr_id >= 0)
+ rdtgroup_assign_cntr(r, d, evtid, rdtgrp->mon.rmid,
+ rdtgrp->closid, cntr_id, true);
+ }
+}
+
+/*
+ * resctrl_update_cntr_allrdtgrp - Update the counter assignments for the event
+ * across all the groups.
+ * @mevt: MBM monitor event.
+ */
+static void resctrl_update_cntr_allrdtgrp(struct mon_evt *mevt)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid);
+ struct rdtgroup *prgrp, *crgrp;
+
+ /*
+ * Find all the groups where the event is assigned and update the
+ * configuration of existing assignments.
+ */
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ rdtgroup_update_cntr_event(r, prgrp, mevt->evtid);
+
+ list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
+ rdtgroup_update_cntr_event(r, crgrp, mevt->evtid);
+ }
+}
+
+ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct mon_evt *mevt = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r;
+ u32 evt_cfg = 0;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ buf[nbytes - 1] = '\0';
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ r = resctrl_arch_get_resource(mevt->rid);
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = resctrl_parse_mem_transactions(buf, &evt_cfg);
+ if (!ret && mevt->evt_cfg != evt_cfg) {
+ mevt->evt_cfg = evt_cfg;
+ resctrl_update_cntr_allrdtgrp(mevt);
+ }
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return ret ?: nbytes;
+}
+
+int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ bool enabled;
+
+ mutex_lock(&rdtgroup_mutex);
+ enabled = resctrl_arch_mbm_cntr_assign_enabled(r);
+
+ if (r->mon.mbm_cntr_assignable) {
+ if (enabled)
+ seq_puts(s, "[mbm_event]\n");
+ else
+ seq_puts(s, "[default]\n");
+
+ if (!IS_ENABLED(CONFIG_RESCTRL_ASSIGN_FIXED)) {
+ if (enabled)
+ seq_puts(s, "default\n");
+ else
+ seq_puts(s, "mbm_event\n");
+ }
+ } else {
+ seq_puts(s, "[default]\n");
+ }
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ struct rdt_mon_domain *d;
+ int ret = 0;
+ bool enable;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ buf[nbytes - 1] = '\0';
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ if (!strcmp(buf, "default")) {
+ enable = 0;
+ } else if (!strcmp(buf, "mbm_event")) {
+ if (r->mon.mbm_cntr_assignable) {
+ enable = 1;
+ } else {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("mbm_event mode is not supported\n");
+ goto out_unlock;
+ }
+ } else {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("Unsupported assign mode\n");
+ goto out_unlock;
+ }
+
+ if (enable != resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ ret = resctrl_arch_mbm_cntr_assign_set(r, enable);
+ if (ret)
+ goto out_unlock;
+
+ /* Update the visibility of BMEC related files */
+ resctrl_bmec_files_show(r, NULL, !enable);
+
+ /*
+ * Initialize the default memory transaction values for
+ * total and local events.
+ */
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
+ (READS_TO_LOCAL_MEM |
+ READS_TO_LOCAL_S_MEM |
+ NON_TEMP_WRITE_TO_LOCAL_MEM);
+ /* Enable auto assignment when switching to "mbm_event" mode */
+ if (enable)
+ r->mon.mbm_assign_on_mkdir = true;
+ /*
+ * Reset all the non-architectural RMID state and assignable counters.
+ */
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ mbm_cntr_free_all(r, d);
+ resctrl_reset_rmid_all(r, d);
+ }
+ }
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return ret ?: nbytes;
+}
+
+int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ struct rdt_mon_domain *dom;
+ bool sep = false;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
+ if (sep)
+ seq_putc(s, ';');
+
+ seq_printf(s, "%d=%d", dom->hdr.id, r->mon.num_mbm_cntrs);
+ sep = true;
+ }
+ seq_putc(s, '\n');
+
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+ return 0;
+}
+
+int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ struct rdt_mon_domain *dom;
+ bool sep = false;
+ u32 cntrs, i;
+ int ret = 0;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
+ if (sep)
+ seq_putc(s, ';');
+
+ cntrs = 0;
+ for (i = 0; i < r->mon.num_mbm_cntrs; i++) {
+ if (!dom->cntr_cfg[i].rdtgrp)
+ cntrs++;
+ }
+
+ seq_printf(s, "%d=%u", dom->hdr.id, cntrs);
+ sep = true;
+ }
+ seq_putc(s, '\n');
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return ret;
+}
+
+int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdt_mon_domain *d;
+ struct rdtgroup *rdtgrp;
+ struct mon_evt *mevt;
+ int ret = 0;
+ bool sep;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ rdt_last_cmd_clear();
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ for_each_mon_event(mevt) {
+ if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid))
+ continue;
+
+ sep = false;
+ seq_printf(s, "%s:", mevt->name);
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (sep)
+ seq_putc(s, ';');
+
+ if (mbm_cntr_get(r, d, rdtgrp, mevt->evtid) < 0)
+ seq_printf(s, "%d=_", d->hdr.id);
+ else
+ seq_printf(s, "%d=e", d->hdr.id);
+
+ sep = true;
+ }
+ seq_putc(s, '\n');
+ }
+
+out_unlock:
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/*
+ * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching
+ * event name.
+ */
+static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *name)
+{
+ struct mon_evt *mevt;
+
+ for_each_mon_event(mevt) {
+ if (mevt->rid == r->rid && mevt->enabled &&
+ resctrl_is_mbm_event(mevt->evtid) &&
+ !strcmp(mevt->name, name))
+ return mevt;
+ }
+
+ return NULL;
+}
+
+static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d,
+ struct rdtgroup *rdtgrp, struct mon_evt *mevt)
+{
+ int ret = 0;
+
+ if (!assign || strlen(assign) != 1)
+ return -EINVAL;
+
+ switch (*assign) {
+ case 'e':
+ ret = rdtgroup_assign_cntr_event(d, rdtgrp, mevt);
+ break;
+ case '_':
+ rdtgroup_unassign_cntr_event(d, rdtgrp, mevt);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp,
+ char *event, char *tok)
+{
+ struct rdt_mon_domain *d;
+ unsigned long dom_id = 0;
+ char *dom_str, *id_str;
+ struct mon_evt *mevt;
+ int ret;
+
+ mevt = mbm_get_mon_event_by_name(r, event);
+ if (!mevt) {
+ rdt_last_cmd_printf("Invalid event %s\n", event);
+ return -ENOENT;
+ }
+
+next:
+ if (!tok || tok[0] == '\0')
+ return 0;
+
+ /* Start processing the strings for each domain */
+ dom_str = strim(strsep(&tok, ";"));
+
+ id_str = strsep(&dom_str, "=");
+
+ /* Check for domain id '*' which means all domains */
+ if (id_str && *id_str == '*') {
+ ret = rdtgroup_modify_assign_state(dom_str, NULL, rdtgrp, mevt);
+ if (ret)
+ rdt_last_cmd_printf("Assign operation '%s:*=%s' failed\n",
+ event, dom_str);
+ return ret;
+ } else if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
+ rdt_last_cmd_puts("Missing domain id\n");
+ return -EINVAL;
+ }
+
+ /* Verify if the dom_id is valid */
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (d->hdr.id == dom_id) {
+ ret = rdtgroup_modify_assign_state(dom_str, d, rdtgrp, mevt);
+ if (ret) {
+ rdt_last_cmd_printf("Assign operation '%s:%ld=%s' failed\n",
+ event, dom_id, dom_str);
+ return ret;
+ }
+ goto next;
+ }
+ }
+
+ rdt_last_cmd_printf("Invalid domain id %ld\n", dom_id);
+ return -EINVAL;
+}
+
+ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdtgroup *rdtgrp;
+ char *token, *event;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+ rdt_last_cmd_clear();
+
+ if (!resctrl_arch_mbm_cntr_assign_enabled(r)) {
+ rdt_last_cmd_puts("mbm_event mode is not enabled\n");
+ rdtgroup_kn_unlock(of->kn);
+ return -EINVAL;
+ }
+
+ while ((token = strsep(&buf, "\n")) != NULL) {
+ /*
+ * Each write command uses the following format:
+ * "<Event>:<Domain ID>=<Assignment state>"
+ * Extract the event name first.
+ */
+ event = strsep(&token, ":");
+
+ ret = resctrl_parse_mbm_assignment(r, rdtgrp, event, token);
+ if (ret)
+ break;
+ }
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
}
/**
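
Tying the new files together: strings written to a group's mbm_L3_assignments file follow the "<event>:<domain id>=<state>" grammar parsed above, with lines split on '\n', domains on ';', 'e' assigning a counter, '_' releasing it, and '*' standing for every domain; the per-event event_filter file (created under the resource's MON info directory in an event_configs/<event> subdirectory) takes a comma-separated list of the mbm_transactions[] names, for example "local_reads,remote_reads". Illustrative write strings only, not part of the patch (domain ids 0 and 1 are hypothetical):

static const char * const mbm_assignment_examples[] = {
	"mbm_total_bytes:0=e;1=e\n",	/* assign a counter for the total event in domains 0 and 1 */
	"mbm_local_bytes:*=_\n",	/* release the local event's counters in every domain */
};
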
@@ -902,24 +1765,43 @@ int resctrl_mon_resource_init(void)
if (ret)
return ret;
- l3_mon_evt_init(r);
-
if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
- mbm_total_event.configurable = true;
+ mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true;
resctrl_file_fflags_init("mbm_total_bytes_config",
RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
}
if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
- mbm_local_event.configurable = true;
+ mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true;
resctrl_file_fflags_init("mbm_local_bytes_config",
RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
}
- if (resctrl_arch_is_mbm_local_enabled())
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
- else if (resctrl_arch_is_mbm_total_enabled())
+ else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+ if (r->mon.mbm_cntr_assignable) {
+ if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+ resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
+ if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID))
+ resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
+ mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
+ mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask &
+ (READS_TO_LOCAL_MEM |
+ READS_TO_LOCAL_S_MEM |
+ NON_TEMP_WRITE_TO_LOCAL_MEM);
+ r->mon.mbm_assign_on_mkdir = true;
+ resctrl_file_fflags_init("num_mbm_cntrs",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ resctrl_file_fflags_init("available_mbm_cntrs",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG);
+ resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO |
+ RFTYPE_RES_CACHE);
+ resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE);
+ }
+
return 0;
}
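
For reference, resctrl_mbm_assign_mode_show() above prints the active mode in brackets, followed by the other selectable mode unless CONFIG_RESCTRL_ASSIGN_FIXED is set or the hardware lacks assignable counters. Assuming the file lives under the resource's info directory (e.g. info/L3_MON/mbm_assign_mode), its contents would look like:

	[default]
	mbm_event

and, after writing "mbm_event" to the file:

	[mbm_event]
	default
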
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 77d08229d855..0320360cd7a6 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -123,14 +123,8 @@ void rdt_staged_configs_clear(void)
static bool resctrl_is_mbm_enabled(void)
{
- return (resctrl_arch_is_mbm_total_enabled() ||
- resctrl_arch_is_mbm_local_enabled());
-}
-
-static bool resctrl_is_mbm_event(int e)
-{
- return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
- e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+ return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) ||
+ resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID));
}
/*
@@ -196,7 +190,7 @@ static int closid_alloc(void)
lockdep_assert_held(&rdtgroup_mutex);
if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
- resctrl_arch_is_llc_occupancy_enabled()) {
+ resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) {
cleanest_closid = resctrl_find_cleanest_closid();
if (cleanest_closid < 0)
return cleanest_closid;
@@ -981,7 +975,7 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
return 0;
}
-static void *rdt_kn_parent_priv(struct kernfs_node *kn)
+void *rdt_kn_parent_priv(struct kernfs_node *kn)
{
/*
* The parent pointer is only valid within RCU section since it can be
@@ -1141,7 +1135,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of,
{
struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
- seq_printf(seq, "%d\n", r->num_rmid);
+ seq_printf(seq, "%d\n", r->mon.num_rmid);
return 0;
}
@@ -1152,9 +1146,12 @@ static int rdt_mon_features_show(struct kernfs_open_file *of,
struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
struct mon_evt *mevt;
- list_for_each_entry(mevt, &r->evt_list, list) {
+ for_each_mon_event(mevt) {
+ if (mevt->rid != r->rid || !mevt->enabled)
+ continue;
seq_printf(seq, "%s\n", mevt->name);
- if (mevt->configurable)
+ if (mevt->configurable &&
+ !resctrl_arch_mbm_cntr_assign_enabled(r))
seq_printf(seq, "%s_config\n", mevt->name);
}
@@ -1735,9 +1732,9 @@ next:
}
/* Value from user cannot be more than the supported set of events */
- if ((val & r->mbm_cfg_mask) != val) {
+ if ((val & r->mon.mbm_cfg_mask) != val) {
rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
- r->mbm_cfg_mask);
+ r->mon.mbm_cfg_mask);
return -EINVAL;
}
@@ -1803,6 +1800,44 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
return ret ?: nbytes;
}
+/*
+ * resctrl_bmec_files_show() - Controls the visibility of BMEC-related resctrl
+ * files. When @show is true, the files are displayed; when false, the files
+ * are hidden.
+ * Don't treat kernfs_find_and_get failure as an error, since this function may
+ * be called regardless of whether BMEC is supported or the event is enabled.
+ */
+void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn,
+ bool show)
+{
+ struct kernfs_node *kn_config, *mon_kn = NULL;
+ char name[32];
+
+ if (!l3_mon_kn) {
+ sprintf(name, "%s_MON", r->name);
+ mon_kn = kernfs_find_and_get(kn_info, name);
+ if (!mon_kn)
+ return;
+ l3_mon_kn = mon_kn;
+ }
+
+ kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_total_bytes_config");
+ if (kn_config) {
+ kernfs_show(kn_config, show);
+ kernfs_put(kn_config);
+ }
+
+ kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_local_bytes_config");
+ if (kn_config) {
+ kernfs_show(kn_config, show);
+ kernfs_put(kn_config);
+ }
+
+ /* Release the reference only if it was acquired */
+ if (mon_kn)
+ kernfs_put(mon_kn);
+}
+
/* rdtgroup information files for one cache resource. */
static struct rftype res_common_files[] = {
{
@@ -1813,6 +1848,13 @@ static struct rftype res_common_files[] = {
.fflags = RFTYPE_TOP_INFO,
},
{
+ .name = "mbm_assign_on_mkdir",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = resctrl_mbm_assign_on_mkdir_show,
+ .write = resctrl_mbm_assign_on_mkdir_write,
+ },
+ {
.name = "num_closids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -1827,6 +1869,12 @@ static struct rftype res_common_files[] = {
.fflags = RFTYPE_MON_INFO,
},
{
+ .name = "available_mbm_cntrs",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = resctrl_available_mbm_cntrs_show,
+ },
+ {
.name = "num_rmids",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -1841,6 +1889,12 @@ static struct rftype res_common_files[] = {
.fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
+ .name = "num_mbm_cntrs",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = resctrl_num_mbm_cntrs_show,
+ },
+ {
.name = "min_cbm_bits",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -1916,6 +1970,28 @@ static struct rftype res_common_files[] = {
.write = mbm_local_bytes_config_write,
},
{
+ .name = "event_filter",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = event_filter_show,
+ .write = event_filter_write,
+ },
+ {
+ .name = "mbm_L3_assignments",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = mbm_L3_assignments_show,
+ .write = mbm_L3_assignments_write,
+ },
+ {
+ .name = "mbm_assign_mode",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = resctrl_mbm_assign_mode_show,
+ .write = resctrl_mbm_assign_mode_write,
+ .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
+ },
+ {
.name = "cpus",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -2168,10 +2244,48 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
return ret;
}
+static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn)
+{
+ struct kernfs_node *kn_subdir, *kn_subdir2;
+ struct mon_evt *mevt;
+ int ret;
+
+ kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL);
+ if (IS_ERR(kn_subdir))
+ return PTR_ERR(kn_subdir);
+
+ ret = rdtgroup_kn_set_ugid(kn_subdir);
+ if (ret)
+ return ret;
+
+ for_each_mon_event(mevt) {
+ if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid))
+ continue;
+
+ kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt);
+ if (IS_ERR(kn_subdir2)) {
+ ret = PTR_ERR(kn_subdir2);
+ goto out;
+ }
+
+ ret = rdtgroup_kn_set_ugid(kn_subdir2);
+ if (ret)
+ goto out;
+
+ ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG);
+ if (ret)
+ break;
+ }
+
+out:
+ return ret;
+}
+
static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
unsigned long fflags)
{
struct kernfs_node *kn_subdir;
+ struct rdt_resource *r;
int ret;
kn_subdir = kernfs_create_dir(kn_info, name,
@@ -2184,8 +2298,25 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
return ret;
ret = rdtgroup_add_files(kn_subdir, fflags);
- if (!ret)
- kernfs_activate(kn_subdir);
+ if (ret)
+ return ret;
+
+ if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) {
+ r = priv;
+ if (r->mon.mbm_cntr_assignable) {
+ ret = resctrl_mkdir_event_configs(r, kn_subdir);
+ if (ret)
+ return ret;
+ /*
+ * Hide BMEC related files if mbm_event mode
+ * is enabled.
+ */
+ if (resctrl_arch_mbm_cntr_assign_enabled(r))
+ resctrl_bmec_files_show(r, kn_subdir, false);
+ }
+ }
+
+ kernfs_activate(kn_subdir);
return ret;
}
@@ -2608,10 +2739,8 @@ static int rdt_get_tree(struct fs_context *fc)
goto out_root;
ret = schemata_list_create();
- if (ret) {
- schemata_list_destroy();
- goto out_ctx;
- }
+ if (ret)
+ goto out_schemata_free;
ret = closid_init();
if (ret)
@@ -2637,6 +2766,8 @@ static int rdt_get_tree(struct fs_context *fc)
if (ret < 0)
goto out_info;
+ rdtgroup_assign_cntrs(&rdtgroup_default);
+
ret = mkdir_mondata_all(rdtgroup_default.kn,
&rdtgroup_default, &kn_mondata);
if (ret < 0)
@@ -2675,15 +2806,16 @@ out_mondata:
if (resctrl_arch_mon_capable())
kernfs_remove(kn_mondata);
out_mongrp:
- if (resctrl_arch_mon_capable())
+ if (resctrl_arch_mon_capable()) {
+ rdtgroup_unassign_cntrs(&rdtgroup_default);
kernfs_remove(kn_mongrp);
+ }
out_info:
kernfs_remove(kn_info);
out_closid_exit:
closid_exit();
out_schemata_free:
schemata_list_destroy();
-out_ctx:
rdt_disable_ctx();
out_root:
rdtgroup_destroy_root();
@@ -2822,6 +2954,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
head = &rdtgrp->mon.crdtgrp_list;
list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
+ rdtgroup_unassign_cntrs(sentry);
free_rmid(sentry->closid, sentry->mon.rmid);
list_del(&sentry->mon.crdtgrp_list);
@@ -2862,6 +2995,8 @@ static void rmdir_all_sub(void)
cpumask_or(&rdtgroup_default.cpu_mask,
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+ rdtgroup_unassign_cntrs(rdtgrp);
+
free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
kernfs_remove(rdtgrp->kn);
@@ -2946,6 +3081,7 @@ static void resctrl_fs_teardown(void)
return;
rmdir_all_sub();
+ rdtgroup_unassign_cntrs(&rdtgroup_default);
mon_put_kn_priv();
rdt_pseudo_lock_release();
rdtgroup_default.mode = RDT_MODE_SHAREABLE;
@@ -3057,10 +3193,9 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
struct mon_evt *mevt;
int ret, domid;
- if (WARN_ON(list_empty(&r->evt_list)))
- return -EPERM;
-
- list_for_each_entry(mevt, &r->evt_list, list) {
+ for_each_mon_event(mevt) {
+ if (mevt->rid != r->rid || !mevt->enabled)
+ continue;
domid = do_sum ? d->ci_id : d->hdr.id;
priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum);
if (WARN_ON_ONCE(!priv))
@@ -3427,9 +3562,12 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
}
rdtgrp->mon.rmid = ret;
+ rdtgroup_assign_cntrs(rdtgrp);
+
ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
if (ret) {
rdt_last_cmd_puts("kernfs subdir error\n");
+ rdtgroup_unassign_cntrs(rdtgrp);
free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
return ret;
}
@@ -3439,8 +3577,10 @@ static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
{
- if (resctrl_arch_mon_capable())
+ if (resctrl_arch_mon_capable()) {
+ rdtgroup_unassign_cntrs(rgrp);
free_rmid(rgrp->closid, rgrp->mon.rmid);
+ }
}
/*
@@ -3716,6 +3856,9 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
update_closid_rmid(tmpmask, NULL);
rdtgrp->flags = RDT_DELETED;
+
+ rdtgroup_unassign_cntrs(rdtgrp);
+
free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
/*
@@ -3763,6 +3906,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
update_closid_rmid(tmpmask, NULL);
+ rdtgroup_unassign_cntrs(rdtgrp);
+
free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
closid_free(rdtgrp->closid);
@@ -4022,9 +4167,14 @@ static void rdtgroup_setup_default(void)
static void domain_destroy_mon_state(struct rdt_mon_domain *d)
{
+ int idx;
+
+ kfree(d->cntr_cfg);
bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
- kfree(d->mbm_local);
+ for_each_mbm_idx(idx) {
+ kfree(d->mbm_states[idx]);
+ d->mbm_states[idx] = NULL;
+ }
}
void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
@@ -4050,7 +4200,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d
if (resctrl_is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
- if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) {
/*
* When a package is going down, forcefully
* decrement rmid->ebusy. There is no way to know
@@ -4084,32 +4234,41 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d
static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
{
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
- size_t tsize;
+ size_t tsize = sizeof(*d->mbm_states[0]);
+ enum resctrl_event_id eventid;
+ int idx;
- if (resctrl_arch_is_llc_occupancy_enabled()) {
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) {
d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
if (!d->rmid_busy_llc)
return -ENOMEM;
}
- if (resctrl_arch_is_mbm_total_enabled()) {
- tsize = sizeof(*d->mbm_total);
- d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
- if (!d->mbm_total) {
- bitmap_free(d->rmid_busy_llc);
- return -ENOMEM;
- }
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL);
+ if (!d->mbm_states[idx])
+ goto cleanup;
}
- if (resctrl_arch_is_mbm_local_enabled()) {
- tsize = sizeof(*d->mbm_local);
- d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
- if (!d->mbm_local) {
- bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
- return -ENOMEM;
- }
+
+ if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) {
+ tsize = sizeof(*d->cntr_cfg);
+ d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL);
+ if (!d->cntr_cfg)
+ goto cleanup;
}
return 0;
+cleanup:
+ bitmap_free(d->rmid_busy_llc);
+ for_each_mbm_idx(idx) {
+ kfree(d->mbm_states[idx]);
+ d->mbm_states[idx] = NULL;
+ }
+
+ return -ENOMEM;
}
int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
@@ -4144,7 +4303,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
RESCTRL_PICK_ANY_CPU);
}
- if (resctrl_arch_is_llc_occupancy_enabled())
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID))
INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
/*
@@ -4219,7 +4378,7 @@ void resctrl_offline_cpu(unsigned int cpu)
cancel_delayed_work(&d->mbm_over);
mbm_setup_overflow_handler(d, 0, cpu);
}
- if (resctrl_arch_is_llc_occupancy_enabled() &&
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) &&
cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
cancel_delayed_work(&d->cqm_limbo);
cqm_setup_limbo_handler(d, 0, cpu);
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index beb4f18f05ef..35c4d27d2cc0 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -24,6 +24,7 @@
#endif
#ifdef CONFIG_CIFS_SMB_DIRECT
#include "smbdirect.h"
+#include "../common/smbdirect/smbdirect_pdu.h"
#endif
#include "cifs_swn.h"
#include "cached_dir.h"
@@ -304,6 +305,8 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
cfids = tcon->cfids;
+ if (!cfids)
+ continue;
spin_lock(&cfids->cfid_list_lock); /* check lock ordering */
seq_printf(m, "Num entries: %d\n", cfids->num_entries);
list_for_each_entry(cfid, &cfids->entries, entry) {
@@ -319,8 +322,6 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\n");
}
spin_unlock(&cfids->cfid_list_lock);
-
-
}
}
}
@@ -347,6 +348,22 @@ static __always_inline const char *compression_alg_str(__le16 alg)
}
}
+static __always_inline const char *cipher_alg_str(__le16 cipher)
+{
+ switch (cipher) {
+ case SMB2_ENCRYPTION_AES128_CCM:
+ return "AES128-CCM";
+ case SMB2_ENCRYPTION_AES128_GCM:
+ return "AES128-GCM";
+ case SMB2_ENCRYPTION_AES256_CCM:
+ return "AES256-CCM";
+ case SMB2_ENCRYPTION_AES256_GCM:
+ return "AES256-GCM";
+ default:
+ return "UNKNOWN";
+ }
+}
+
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
struct mid_q_entry *mid_entry;
@@ -440,57 +457,55 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
sc = &server->smbd_conn->socket;
sp = &sc->parameters;
- seq_printf(m, "\nSMBDirect (in hex) protocol version: %x "
- "transport status: %x",
- server->smbd_conn->protocol,
- server->smbd_conn->socket.status);
- seq_printf(m, "\nConn receive_credit_max: %x "
- "send_credit_target: %x max_send_size: %x",
+ seq_printf(m, "\nSMBDirect protocol version: 0x%x "
+ "transport status: %s (%u)",
+ SMBDIRECT_V1,
+ smbdirect_socket_status_string(sc->status),
+ sc->status);
+ seq_printf(m, "\nConn receive_credit_max: %u "
+ "send_credit_target: %u max_send_size: %u",
sp->recv_credit_max,
sp->send_credit_target,
sp->max_send_size);
- seq_printf(m, "\nConn max_fragmented_recv_size: %x "
- "max_fragmented_send_size: %x max_receive_size:%x",
+ seq_printf(m, "\nConn max_fragmented_recv_size: %u "
+ "max_fragmented_send_size: %u max_receive_size:%u",
sp->max_fragmented_recv_size,
sp->max_fragmented_send_size,
sp->max_recv_size);
- seq_printf(m, "\nConn keep_alive_interval: %x "
- "max_readwrite_size: %x rdma_readwrite_threshold: %x",
+ seq_printf(m, "\nConn keep_alive_interval: %u "
+ "max_readwrite_size: %u rdma_readwrite_threshold: %u",
sp->keepalive_interval_msec * 1000,
sp->max_read_write_size,
- server->smbd_conn->rdma_readwrite_threshold);
- seq_printf(m, "\nDebug count_get_receive_buffer: %x "
- "count_put_receive_buffer: %x count_send_empty: %x",
- server->smbd_conn->count_get_receive_buffer,
- server->smbd_conn->count_put_receive_buffer,
- server->smbd_conn->count_send_empty);
- seq_printf(m, "\nRead Queue count_reassembly_queue: %x "
- "count_enqueue_reassembly_queue: %x "
- "count_dequeue_reassembly_queue: %x "
- "reassembly_data_length: %x "
- "reassembly_queue_length: %x",
- server->smbd_conn->count_reassembly_queue,
- server->smbd_conn->count_enqueue_reassembly_queue,
- server->smbd_conn->count_dequeue_reassembly_queue,
+ server->rdma_readwrite_threshold);
+ seq_printf(m, "\nDebug count_get_receive_buffer: %llu "
+ "count_put_receive_buffer: %llu count_send_empty: %llu",
+ sc->statistics.get_receive_buffer,
+ sc->statistics.put_receive_buffer,
+ sc->statistics.send_empty);
+ seq_printf(m, "\nRead Queue "
+ "count_enqueue_reassembly_queue: %llu "
+ "count_dequeue_reassembly_queue: %llu "
+ "reassembly_data_length: %u "
+ "reassembly_queue_length: %u",
+ sc->statistics.enqueue_reassembly_queue,
+ sc->statistics.dequeue_reassembly_queue,
sc->recv_io.reassembly.data_length,
sc->recv_io.reassembly.queue_length);
- seq_printf(m, "\nCurrent Credits send_credits: %x "
- "receive_credits: %x receive_credit_target: %x",
- atomic_read(&server->smbd_conn->send_credits),
- atomic_read(&server->smbd_conn->receive_credits),
- server->smbd_conn->receive_credit_target);
- seq_printf(m, "\nPending send_pending: %x ",
- atomic_read(&server->smbd_conn->send_pending));
- seq_printf(m, "\nReceive buffers count_receive_queue: %x ",
- server->smbd_conn->count_receive_queue);
- seq_printf(m, "\nMR responder_resources: %x "
- "max_frmr_depth: %x mr_type: %x",
- server->smbd_conn->responder_resources,
- server->smbd_conn->max_frmr_depth,
- server->smbd_conn->mr_type);
- seq_printf(m, "\nMR mr_ready_count: %x mr_used_count: %x",
- atomic_read(&server->smbd_conn->mr_ready_count),
- atomic_read(&server->smbd_conn->mr_used_count));
+ seq_printf(m, "\nCurrent Credits send_credits: %u "
+ "receive_credits: %u receive_credit_target: %u",
+ atomic_read(&sc->send_io.credits.count),
+ atomic_read(&sc->recv_io.credits.count),
+ sc->recv_io.credits.target);
+ seq_printf(m, "\nPending send_pending: %u ",
+ atomic_read(&sc->send_io.pending.count));
+ seq_printf(m, "\nMR responder_resources: %u "
+ "max_frmr_depth: %u mr_type: 0x%x",
+ sp->responder_resources,
+ sp->max_frmr_depth,
+ sc->mr_io.type);
+ seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u",
+ atomic_read(&sc->mr_io.ready.count),
+ atomic_read(&sc->mr_io.used.count));
skip_rdma:
#endif
seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x",
@@ -539,6 +554,11 @@ skip_rdma:
else
seq_puts(m, "disabled (not supported by this server)");
+ /* Show negotiated encryption cipher, even if not required */
+ seq_puts(m, "\nEncryption: ");
+ if (server->cipher_type)
+ seq_printf(m, "Negotiated cipher (%s)", cipher_alg_str(server->cipher_type));
+
seq_printf(m, "\n\n\tSessions: ");
i = 0;
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
@@ -576,12 +596,8 @@ skip_rdma:
/* dump session id helpful for use with network trace */
seq_printf(m, " SessionId: 0x%llx", ses->Suid);
- if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) {
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
seq_puts(m, " encrypted");
- /* can help in debugging to show encryption type */
- if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
- seq_puts(m, "(gcm256)");
- }
if (ses->sign)
seq_puts(m, " signed");
diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c
index 4cc6e0896fad..f8659d36793f 100644
--- a/fs/smb/client/cifs_unicode.c
+++ b/fs/smb/client/cifs_unicode.c
@@ -629,6 +629,9 @@ cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
int len;
__le16 *dst;
+ if (!src)
+ return NULL;
+
len = cifs_local_to_utf16_bytes(src, maxlen, cp);
len += 2; /* NULL */
dst = kmalloc(len, GFP_KERNEL);
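A minimal, hypothetical caller sketch (not part of this patch) showing what the new guard buys: an optional string, e.g. a symlink target that may be absent, can be passed straight in, and a NULL source now simply yields a NULL result that the caller can treat like an allocation failure:

	__le16 *utf16;
	int utf16_len;

	/* 'symname' is hypothetical here and may legitimately be NULL */
	utf16 = cifs_strndup_to_utf16(symname, PATH_MAX, &utf16_len,
				      cifs_sb->local_nls, cifs_remap(cifs_sb));
	if (!utf16)
		return -ENOMEM;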
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index e1848276bab4..dcb39d1b5958 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode)
/* no serverino => unconditional eviction */
return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
- generic_drop_inode(inode);
+ inode_generic_drop(inode);
}
static const struct super_operations cifs_super_ops = {
@@ -1895,7 +1895,9 @@ init_cifs(void)
cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n");
}
- cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ cifsiod_wq = alloc_workqueue("cifsiod",
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!cifsiod_wq) {
rc = -ENOMEM;
goto out_clean_proc;
@@ -1923,28 +1925,32 @@ init_cifs(void)
}
cifsoplockd_wq = alloc_workqueue("cifsoplockd",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!cifsoplockd_wq) {
rc = -ENOMEM;
goto out_destroy_fileinfo_put_wq;
}
deferredclose_wq = alloc_workqueue("deferredclose",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!deferredclose_wq) {
rc = -ENOMEM;
goto out_destroy_cifsoplockd_wq;
}
serverclose_wq = alloc_workqueue("serverclose",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!serverclose_wq) {
rc = -ENOMEM;
goto out_destroy_deferredclose_wq;
}
cfid_put_wq = alloc_workqueue("cfid_put_wq",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!cfid_put_wq) {
rc = -ENOMEM;
goto out_destroy_serverclose_wq;
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 1e64a4fb6af0..3ac254e123dc 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -87,7 +87,7 @@
#define SMB_INTERFACE_POLL_INTERVAL 600
/* maximum number of PDUs in one compound */
-#define MAX_COMPOUND 7
+#define MAX_COMPOUND 10
/*
* Default number of credits to keep available for SMB3.
@@ -814,6 +814,13 @@ struct TCP_Server_Info {
unsigned int max_read;
unsigned int max_write;
unsigned int min_offload;
+ /*
+ * If payload is less than or equal to the threshold,
+ * use RDMA send/recv to send upper layer I/O.
+ * If payload is more than the threshold,
+ * use RDMA read/write through memory registration for I/O.
+ */
+ unsigned int rdma_readwrite_threshold;
unsigned int retrans;
struct {
bool requested; /* "compress" mount option set*/
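A minimal sketch (not part of this patch) of the decision rule the new comment describes; it mirrors the check smb3_use_rdma_offload() performs in smb2pdu.c further down in this diff, with a hypothetical helper name:

	static bool payload_wants_rdma_offload(const struct TCP_Server_Info *server,
					       unsigned int len)
	{
		if (len < server->rdma_readwrite_threshold)
			return false;	/* below threshold: inline RDMA send/recv */
		return true;		/* otherwise: RDMA read/write via memory registration */
	}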
@@ -1540,7 +1547,7 @@ struct cifs_io_subrequest {
struct kvec iov[2];
struct TCP_Server_Info *server;
#ifdef CONFIG_CIFS_SMB_DIRECT
- struct smbd_mr *mr;
+ struct smbdirect_mr_io *mr;
#endif
struct cifs_credits credits;
};
@@ -1882,9 +1889,12 @@ static inline bool is_replayable_error(int error)
/* cifs_get_writable_file() flags */
-#define FIND_WR_ANY 0
-#define FIND_WR_FSUID_ONLY 1
-#define FIND_WR_WITH_DELETE 2
+enum cifs_writable_file_flags {
+ FIND_WR_ANY = 0U,
+ FIND_WR_FSUID_ONLY = (1U << 0),
+ FIND_WR_WITH_DELETE = (1U << 1),
+ FIND_WR_NO_PENDING_DELETE = (1U << 2),
+};
#define MID_FREE 0
#define MID_REQUEST_ALLOCATED 1
@@ -2343,6 +2353,8 @@ struct smb2_compound_vars {
struct kvec qi_iov;
struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec unlink_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec rename_iov[SMB2_SET_INFO_IOV_SIZE];
struct kvec close_iov;
struct smb2_file_rename_info_hdr rename_info;
struct smb2_file_link_info_hdr link_info;
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index c34c533b2efa..e8fba98690ce 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -312,8 +312,8 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode);
extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon);
-extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
- const char *path);
+void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
+ struct dentry *dentry);
extern void cifs_mark_open_handles_for_deleted_file(struct inode *inode,
const char *path);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 186e061068be..a5ed742afa00 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -97,8 +97,12 @@ retry:
cifs_trace_rw_credits_write_prepare);
#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->smbd_conn)
- stream->sreq_max_segs = server->smbd_conn->max_frmr_depth;
+ if (server->smbd_conn) {
+ const struct smbdirect_socket_parameters *sp =
+ smbd_get_parameters(server->smbd_conn);
+
+ stream->sreq_max_segs = sp->max_frmr_depth;
+ }
#endif
}
@@ -187,8 +191,12 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)
cifs_trace_rw_credits_read_submit);
#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->smbd_conn)
- rreq->io_streams[0].sreq_max_segs = server->smbd_conn->max_frmr_depth;
+ if (server->smbd_conn) {
+ const struct smbdirect_socket_parameters *sp =
+ smbd_get_parameters(server->smbd_conn);
+
+ rreq->io_streams[0].sreq_max_segs = sp->max_frmr_depth;
+ }
#endif
return 0;
}
@@ -998,7 +1006,10 @@ int cifs_open(struct inode *inode, struct file *file)
/* Get the cached handle as SMB2 close is deferred */
if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) {
- rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile);
+ rc = cifs_get_writable_path(tcon, full_path,
+ FIND_WR_FSUID_ONLY |
+ FIND_WR_NO_PENDING_DELETE,
+ &cfile);
} else {
rc = cifs_get_readable_path(tcon, full_path, &cfile);
}
@@ -2530,6 +2541,9 @@ refind_writable:
continue;
if (with_delete && !(open_file->fid.access & DELETE))
continue;
+ if ((flags & FIND_WR_NO_PENDING_DELETE) &&
+ open_file->status_file_deleted)
+ continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -2647,6 +2661,16 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
spin_unlock(&tcon->open_file_lock);
free_dentry_path(page);
*ret_file = find_readable_file(cinode, 0);
+ if (*ret_file) {
+ spin_lock(&cinode->open_file_lock);
+ if ((*ret_file)->status_file_deleted) {
+ spin_unlock(&cinode->open_file_lock);
+ cifsFileInfo_put(*ret_file);
+ *ret_file = NULL;
+ } else {
+ spin_unlock(&cinode->open_file_lock);
+ }
+ }
return *ret_file ? 0 : -ENOENT;
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index fe453a4b3dc8..7e9784080501 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -1931,7 +1931,7 @@ cifs_drop_nlink(struct inode *inode)
* but will return the EACCES to the caller. Note that the VFS does not call
* unlink on negative dentries currently.
*/
-int cifs_unlink(struct inode *dir, struct dentry *dentry)
+static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyrename)
{
int rc = 0;
unsigned int xid;
@@ -1984,7 +1984,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
}
netfs_wait_for_outstanding_io(inode);
- cifs_close_deferred_file_under_dentry(tcon, full_path);
+ cifs_close_deferred_file_under_dentry(tcon, dentry);
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -2003,7 +2003,24 @@ retry_std_delete:
goto psx_del_no_retry;
}
- rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
+ /* For SMB2+, if the file is open, we always perform a silly rename.
+ *
+ * We check for d_count() right after calling
+ * cifs_close_deferred_file_under_dentry() to make sure that the
+ * dentry's refcount gets dropped in case the file had any deferred
+ * close.
+ */
+ if (!sillyrename && server->vals->protocol_id > SMB10_PROT_ID) {
+ spin_lock(&dentry->d_lock);
+ if (d_count(dentry) > 1)
+ sillyrename = true;
+ spin_unlock(&dentry->d_lock);
+ }
+
+ if (sillyrename)
+ rc = -EBUSY;
+ else
+ rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
psx_del_no_retry:
if (!rc) {
@@ -2071,6 +2088,11 @@ unlink_out:
return rc;
}
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return __cifs_unlink(dir, dentry, false);
+}
+
static int
cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
const char *full_path, struct cifs_sb_info *cifs_sb,
@@ -2358,14 +2380,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb);
cifs_put_tlink(tlink);
+ cifsInode = CIFS_I(d_inode(direntry));
+
if (!rc) {
+ set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags);
spin_lock(&d_inode(direntry)->i_lock);
i_size_write(d_inode(direntry), 0);
clear_nlink(d_inode(direntry));
spin_unlock(&d_inode(direntry)->i_lock);
}
- cifsInode = CIFS_I(d_inode(direntry));
/* force revalidate to go get info when needed */
cifsInode->time = 0;
@@ -2458,8 +2482,11 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
do_rename_exit:
- if (rc == 0)
+ if (rc == 0) {
d_move(from_dentry, to_dentry);
+ /* Force a new lookup */
+ d_drop(from_dentry);
+ }
cifs_put_tlink(tlink);
return rc;
}
@@ -2470,6 +2497,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
struct dentry *target_dentry, unsigned int flags)
{
const char *from_name, *to_name;
+ struct TCP_Server_Info *server;
void *page1, *page2;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
@@ -2505,6 +2533,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
page1 = alloc_dentry_path();
page2 = alloc_dentry_path();
@@ -2522,10 +2551,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
goto cifs_rename_exit;
}
- cifs_close_deferred_file_under_dentry(tcon, from_name);
+ cifs_close_deferred_file_under_dentry(tcon, source_dentry);
if (d_inode(target_dentry) != NULL) {
netfs_wait_for_outstanding_io(d_inode(target_dentry));
- cifs_close_deferred_file_under_dentry(tcon, to_name);
+ cifs_close_deferred_file_under_dentry(tcon, target_dentry);
}
rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
@@ -2591,19 +2620,53 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
unlink_target:
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-
- /* Try unlinking the target dentry if it's not negative */
- if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) {
- if (d_is_dir(target_dentry))
- tmprc = cifs_rmdir(target_dir, target_dentry);
- else
- tmprc = cifs_unlink(target_dir, target_dentry);
- if (tmprc)
- goto cifs_rename_exit;
- rc = cifs_do_rename(xid, source_dentry, from_name,
- target_dentry, to_name);
- if (!rc)
- rehash = false;
+ if (d_really_is_positive(target_dentry)) {
+ if (!rc) {
+ struct inode *inode = d_inode(target_dentry);
+ /*
+ * Samba and ksmbd servers allow renaming a target
+ * directory that is open, so make sure to update
+ * ->i_nlink and then mark it as delete pending.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ drop_cached_dir_by_name(xid, tcon, to_name, cifs_sb);
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, 0);
+ clear_nlink(inode);
+ spin_unlock(&inode->i_lock);
+ set_bit(CIFS_INO_DELETE_PENDING, &CIFS_I(inode)->flags);
+ CIFS_I(inode)->time = 0; /* force reval */
+ inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ }
+ } else if (rc == -EACCES || rc == -EEXIST) {
+ /*
+ * Rename failed, possibly due to a busy target.
+			 * Retry it by unlinking the target first.
+ */
+ if (d_is_dir(target_dentry)) {
+ tmprc = cifs_rmdir(target_dir, target_dentry);
+ } else {
+ tmprc = __cifs_unlink(target_dir, target_dentry,
+ server->vals->protocol_id > SMB10_PROT_ID);
+ }
+ if (tmprc) {
+ /*
+ * Some servers will return STATUS_ACCESS_DENIED
+ * or STATUS_DIRECTORY_NOT_EMPTY when failing to
+ * rename a non-empty directory. Make sure to
+ * propagate the appropriate error back to
+ * userspace.
+ */
+ if (tmprc == -EEXIST || tmprc == -ENOTEMPTY)
+ rc = tmprc;
+ goto cifs_rename_exit;
+ }
+ rc = cifs_do_rename(xid, source_dentry, from_name,
+ target_dentry, to_name);
+ if (!rc)
+ rehash = false;
+ }
}
/* force revalidate to go get info when needed */
@@ -2629,6 +2692,8 @@ cifs_dentry_needs_reval(struct dentry *dentry)
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct cached_fid *cfid = NULL;
+ if (test_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags))
+ return false;
if (cifs_i->time == 0)
return true;
@@ -2779,7 +2844,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
}
cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
- full_path, inode, inode->i_count.counter,
+ full_path, inode, icount_read(inode),
dentry, cifs_get_time(dentry), jiffies);
again:
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index da23cc12a52c..dda6dece802a 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -832,33 +832,28 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
kfree(tmp_list);
}
}
-void
-cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
+
+void cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon,
+ struct dentry *dentry)
{
- struct cifsFileInfo *cfile;
struct file_list *tmp_list, *tmp_next_list;
- void *page;
- const char *full_path;
+ struct cifsFileInfo *cfile;
LIST_HEAD(file_head);
- page = alloc_dentry_path();
spin_lock(&tcon->open_file_lock);
list_for_each_entry(cfile, &tcon->openFileList, tlist) {
- full_path = build_path_from_dentry(cfile->dentry, page);
- if (strstr(full_path, path)) {
- if (delayed_work_pending(&cfile->deferred)) {
- if (cancel_delayed_work(&cfile->deferred)) {
- spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
- cifs_del_deferred_close(cfile);
- spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
-
- tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
- if (tmp_list == NULL)
- break;
- tmp_list->cfile = cfile;
- list_add_tail(&tmp_list->list, &file_head);
- }
- }
+ if ((cfile->dentry == dentry) &&
+ delayed_work_pending(&cfile->deferred) &&
+ cancel_delayed_work(&cfile->deferred)) {
+ spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
+ cifs_del_deferred_close(cfile);
+ spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
+
+ tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
+ if (tmp_list == NULL)
+ break;
+ tmp_list->cfile = cfile;
+ list_add_tail(&tmp_list->list, &file_head);
}
}
spin_unlock(&tcon->open_file_lock);
@@ -868,7 +863,6 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
list_del(&tmp_list->list);
kfree(tmp_list);
}
- free_dentry_path(page);
}
/*
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 7869cec58f52..10c84c095fe7 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -278,7 +278,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb,
}
/*
- * For absolute symlinks it is not possible to determinate
+ * For absolute symlinks it is not possible to determine
* if it should point to directory or file.
*/
if (symname[0] == '/') {
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 893a1ea8c000..a02d41d1ce4a 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -1005,7 +1005,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
rc = -EOPNOTSUPP;
}
- /* Fallback to SMB_COM_SETATTR command when absolutelty needed. */
+ /* Fallback to SMB_COM_SETATTR command when absolutely needed. */
if (rc == -EOPNOTSUPP) {
cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n");
rc = SMBSetInformation(xid, tcon, full_path,
@@ -1039,7 +1039,7 @@ set_via_filehandle:
cifsFileInfo_put(open_file);
/*
- * Setting the read-only bit is not honered on non-NT servers when done
+ * Setting the read-only bit is not honored on non-NT servers when done
* via open-semantics. So for setting it, use SMB_COM_SETATTR command.
* This command works only after the file is closed, so use it only when
* operation was called without the filehandle.
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 224495322a05..e56e4d402f13 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -30,10 +30,9 @@ enum smb2_compound_ops {
SMB2_OP_QUERY_DIR,
SMB2_OP_MKDIR,
SMB2_OP_RENAME,
- SMB2_OP_DELETE,
SMB2_OP_HARDLINK,
SMB2_OP_SET_EOF,
- SMB2_OP_RMDIR,
+ SMB2_OP_UNLINK,
SMB2_OP_POSIX_QUERY_INFO,
SMB2_OP_SET_REPARSE,
SMB2_OP_GET_REPARSE,
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 31c13fb5b85b..0985db9f86e5 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -346,9 +346,6 @@ replay_again:
trace_smb3_posix_query_info_compound_enter(xid, tcon->tid,
ses->Suid, full_path);
break;
- case SMB2_OP_DELETE:
- trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path);
- break;
case SMB2_OP_MKDIR:
/*
* Directories are created through parameters in the
@@ -356,23 +353,40 @@ replay_again:
*/
trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path);
break;
- case SMB2_OP_RMDIR:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ case SMB2_OP_UNLINK:
+ rqst[num_rqst].rq_iov = vars->unlink_iov;
rqst[num_rqst].rq_nvec = 1;
size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
data[0] = &delete_pending[0];
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst], COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_DISPOSITION_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (rc)
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ }
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
- trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path);
+ }
+ num_rqst++;
+ trace_smb3_unlink_enter(xid, tcon->tid, ses->Suid, full_path);
break;
case SMB2_OP_SET_EOF:
rqst[num_rqst].rq_iov = &vars->si_iov[0];
@@ -442,7 +456,7 @@ replay_again:
ses->Suid, full_path);
break;
case SMB2_OP_RENAME:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_iov = vars->rename_iov;
rqst[num_rqst].rq_nvec = 2;
len = in_iov[i].iov_len;
@@ -673,7 +687,7 @@ finished:
}
for (i = 0; i < num_cmds; i++) {
- char *buf = rsp_iov[i + i].iov_base;
+ char *buf = rsp_iov[i + 1].iov_base;
if (buf && resp_buftype[i + 1] != CIFS_NO_BUFFER)
rc = server->ops->map_error(buf, false);
@@ -732,19 +746,6 @@ finished:
trace_smb3_posix_query_info_compound_done(xid, tcon->tid,
ses->Suid);
break;
- case SMB2_OP_DELETE:
- if (rc)
- trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc);
- else {
- /*
- * If dentry (hence, inode) is NULL, lease break is going to
- * take care of degrading leases on handles for deleted files.
- */
- if (inode)
- cifs_mark_open_handles_for_deleted_file(inode, full_path);
- trace_smb3_delete_done(xid, tcon->tid, ses->Suid);
- }
- break;
case SMB2_OP_MKDIR:
if (rc)
trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc);
@@ -765,11 +766,11 @@ finished:
trace_smb3_rename_done(xid, tcon->tid, ses->Suid);
SMB2_set_info_free(&rqst[num_rqst++]);
break;
- case SMB2_OP_RMDIR:
- if (rc)
- trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc);
+ case SMB2_OP_UNLINK:
+ if (!rc)
+ trace_smb3_unlink_done(xid, tcon->tid, ses->Suid);
else
- trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid);
+ trace_smb3_unlink_err(xid, tcon->tid, ses->Suid, rc);
SMB2_set_info_free(&rqst[num_rqst++]);
break;
case SMB2_OP_SET_EOF:
@@ -1166,7 +1167,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE);
return smb2_compound_op(xid, tcon, cifs_sb,
name, &oparms, NULL,
- &(int){SMB2_OP_RMDIR}, 1,
+ &(int){SMB2_OP_UNLINK}, 1,
NULL, NULL, NULL, NULL);
}
@@ -1174,21 +1175,107 @@ int
smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb, struct dentry *dentry)
{
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ __le16 *utf16_path __free(kfree) = NULL;
+ int retries = 0, cur_sleep = 1;
+ struct TCP_Server_Info *server;
struct cifs_open_parms oparms;
+ struct smb2_create_req *creq;
+ struct inode *inode = NULL;
+ struct smb_rqst rqst[2];
+ struct kvec rsp_iov[2];
+ struct kvec close_iov;
+ int resp_buftype[2];
+ struct cifs_fid fid;
+ int flags = 0;
+ __u8 oplock;
+ int rc;
- oparms = CIFS_OPARMS(cifs_sb, tcon, name,
- DELETE, FILE_OPEN,
- CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE);
- int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
- NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, dentry);
- if (rc == -EINVAL) {
- cifs_dbg(FYI, "invalid lease key, resending request without lease");
- rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
- NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, NULL);
+ utf16_path = cifs_convert_path_to_utf16(name, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+again:
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(tcon->ses);
+
+ memset(rqst, 0, sizeof(rqst));
+ memset(resp_buftype, 0, sizeof(resp_buftype));
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = ARRAY_SIZE(open_iov);
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE | FILE_READ_ATTRIBUTES,
+ FILE_OPEN, CREATE_DELETE_ON_CLOSE |
+ OPEN_REPARSE_POINT, ACL_NO_MODE);
+ oparms.fid = &fid;
+
+ if (dentry) {
+ inode = d_inode(dentry);
+ if (CIFS_I(inode)->lease_granted && server->ops->get_lease_key) {
+ oplock = SMB2_OPLOCK_LEVEL_LEASE;
+ server->ops->get_lease_key(inode, &fid);
+ }
}
+
+ rc = SMB2_open_init(tcon, server,
+ &rqst[0], &oplock, &oparms, utf16_path);
+ if (rc)
+ goto err_free;
+ smb2_set_next_command(tcon, &rqst[0]);
+ creq = rqst[0].rq_iov[0].iov_base;
+ creq->ShareAccess = FILE_SHARE_DELETE_LE;
+
+ rqst[1].rq_iov = &close_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_close_init(tcon, server, &rqst[1],
+ COMPOUND_FID, COMPOUND_FID, false);
+ smb2_set_related(&rqst[1]);
+ if (rc)
+ goto err_free;
+
+ if (retries) {
+ for (int i = 0; i < ARRAY_SIZE(rqst); i++)
+ smb2_set_replay(server, &rqst[i]);
+ }
+
+ rc = compound_send_recv(xid, tcon->ses, server, flags,
+ ARRAY_SIZE(rqst), rqst,
+ resp_buftype, rsp_iov);
+ SMB2_open_free(&rqst[0]);
+ SMB2_close_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto again;
+
+ /* Retry compound request without lease */
+ if (rc == -EINVAL && dentry) {
+ dentry = NULL;
+ retries = 0;
+ cur_sleep = 1;
+ goto again;
+ }
+ /*
+ * If dentry (hence, inode) is NULL, lease break is going to
+ * take care of degrading leases on handles for deleted files.
+ */
+ if (!rc && inode)
+ cifs_mark_open_handles_for_deleted_file(inode, name);
+
+ return rc;
+
+err_free:
+ SMB2_open_free(&rqst[0]);
+ SMB2_close_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
return rc;
}
@@ -1441,3 +1528,113 @@ out:
cifs_free_open_info(&data);
return rc;
}
+
+static inline __le16 *utf16_smb2_path(struct cifs_sb_info *cifs_sb,
+ const char *name, size_t namelen)
+{
+ int len;
+
+ if (*name == '\\' ||
+ (cifs_sb_master_tlink(cifs_sb) &&
+ cifs_sb_master_tcon(cifs_sb)->posix_extensions && *name == '/'))
+ name++;
+ return cifs_strndup_to_utf16(name, namelen, &len,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+}
+
+int smb2_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(d_inode(dentry)->i_sb);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry));
+ __le16 *utf16_path __free(kfree) = NULL;
+ __u32 co = file_create_options(dentry);
+ int cmds[] = {
+ SMB2_OP_SET_INFO,
+ SMB2_OP_RENAME,
+ SMB2_OP_UNLINK,
+ };
+ const int num_cmds = ARRAY_SIZE(cmds);
+ char *to_name __free(kfree) = NULL;
+ __u32 attrs = cinode->cifsAttrs;
+ struct cifs_open_parms oparms;
+ static atomic_t sillycounter;
+ struct cifsFileInfo *cfile;
+ struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
+ struct kvec iov[2];
+ const char *ppath;
+ void *page;
+ size_t len;
+ int rc;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+
+ page = alloc_dentry_path();
+
+ ppath = build_path_from_dentry(dentry->d_parent, page);
+ if (IS_ERR(ppath)) {
+ rc = PTR_ERR(ppath);
+ goto out;
+ }
+
+ len = strlen(ppath) + strlen("/.__smb1234") + 1;
+ to_name = kmalloc(len, GFP_KERNEL);
+ if (!to_name) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ scnprintf(to_name, len, "%s%c.__smb%04X", ppath, CIFS_DIR_SEP(cifs_sb),
+ atomic_inc_return(&sillycounter) & 0xffff);
+
+ utf16_path = utf16_smb2_path(cifs_sb, to_name, len);
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ drop_cached_dir_by_name(xid, tcon, full_path, cifs_sb);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+ DELETE | FILE_WRITE_ATTRIBUTES,
+ FILE_OPEN, co, ACL_NO_MODE);
+
+ attrs &= ~ATTR_READONLY;
+ if (!attrs)
+ attrs = ATTR_NORMAL;
+ if (d_inode(dentry)->i_nlink <= 1)
+ attrs |= ATTR_HIDDEN;
+ iov[0].iov_base = &(FILE_BASIC_INFO) {
+ .Attributes = cpu_to_le32(attrs),
+ };
+ iov[0].iov_len = sizeof(FILE_BASIC_INFO);
+ iov[1].iov_base = utf16_path;
+ iov[1].iov_len = sizeof(*utf16_path) * UniStrlen((wchar_t *)utf16_path);
+
+ cifs_get_writable_path(tcon, full_path, FIND_WR_WITH_DELETE, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov,
+ cmds, num_cmds, cfile, NULL, NULL, dentry);
+ if (rc == -EINVAL) {
+ cifs_dbg(FYI, "invalid lease key, resending request without lease\n");
+ cifs_get_writable_path(tcon, full_path,
+ FIND_WR_WITH_DELETE, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov,
+ cmds, num_cmds, cfile, NULL, NULL, NULL);
+ }
+ if (!rc) {
+ set_bit(CIFS_INO_DELETE_PENDING, &cinode->flags);
+ } else {
+ cifs_tcon_dbg(FYI, "%s: failed to rename '%s' to '%s': %d\n",
+ __func__, full_path, to_name, rc);
+ rc = -EIO;
+ }
+out:
+ cifs_put_tlink(tlink);
+ free_dentry_path(page);
+ return rc;
+}
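For reference, a small illustration (not part of this patch) of the temporary name smb2_rename_pending_delete() builds above, using hypothetical values:

	/* ppath == "\\dir\\sub", CIFS_DIR_SEP(cifs_sb) == '\\', counter & 0xffff == 0x002A */
	scnprintf(to_name, len, "%s%c.__smb%04X", ppath, '\\', 0x002A);
	/* to_name is now "\\dir\\sub\\.__smb002A" */

The counter is masked to 16 bits, so the hex suffix is never longer than four digits, which is exactly the space reserved by the strlen("/.__smb1234") calculation above.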
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index cddf273c14ae..89d933b4a8bc 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -614,6 +614,15 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
struct cifs_tcon *tcon;
struct cifs_pending_open *open;
+ /* Trace receipt of lease break request from server */
+ trace_smb3_lease_break_enter(le32_to_cpu(rsp->CurrentLeaseState),
+ le32_to_cpu(rsp->Flags),
+ le16_to_cpu(rsp->Epoch),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
+
cifs_dbg(FYI, "Checking for lease break\n");
/* If server is a channel, select the primary channel */
@@ -660,10 +669,12 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState),
- le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
- le64_to_cpu(rsp->hdr.SessionId),
- *((u64 *)rsp->LeaseKey),
- *((u64 *)&rsp->LeaseKey[8]));
+ le32_to_cpu(rsp->Flags),
+ le16_to_cpu(rsp->Epoch),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
return false;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 94b1d7a395d5..4711a23c5b38 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -504,8 +504,8 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
wsize = min_t(unsigned int, wsize, server->max_write);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
- struct smbdirect_socket_parameters *sp =
- &server->smbd_conn->socket.parameters;
+ const struct smbdirect_socket_parameters *sp =
+ smbd_get_parameters(server->smbd_conn);
if (server->sign)
/*
@@ -555,8 +555,8 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
rsize = min_t(unsigned int, rsize, server->max_read);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
- struct smbdirect_socket_parameters *sp =
- &server->smbd_conn->socket.parameters;
+ const struct smbdirect_socket_parameters *sp =
+ smbd_get_parameters(server->smbd_conn);
if (server->sign)
/*
@@ -2640,13 +2640,35 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
}
/* SMB headers in a compound are 8 byte aligned. */
- if (!IS_ALIGNED(len, 8)) {
- num_padding = 8 - (len & 7);
+ if (IS_ALIGNED(len, 8))
+ goto out;
+
+ num_padding = 8 - (len & 7);
+ if (smb3_encryption_required(tcon)) {
+ int i;
+
+ /*
+ * Flatten request into a single buffer with required padding as
+ * the encryption layer can't handle the padding iovs.
+ */
+ for (i = 1; i < rqst->rq_nvec; i++) {
+ memcpy(rqst->rq_iov[0].iov_base +
+ rqst->rq_iov[0].iov_len,
+ rqst->rq_iov[i].iov_base,
+ rqst->rq_iov[i].iov_len);
+ rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len;
+ }
+ memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len,
+ 0, num_padding);
+ rqst->rq_iov[0].iov_len += num_padding;
+ rqst->rq_nvec = 1;
+ } else {
rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding;
rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding;
rqst->rq_nvec++;
- len += num_padding;
}
+ len += num_padding;
+out:
shdr->NextCommand = cpu_to_le32(len);
}
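A worked example (not part of this patch) of the alignment arithmetic above: for a compound request of len = 68 bytes, len & 7 == 4, so num_padding = 8 - 4 = 4 and NextCommand becomes 72, i.e. the next SMB2 header starts on the following 8-byte boundary. In the encrypted case those pad bytes are appended to iov[0] and zeroed there, because the transform layer cannot consume a separate padding iov.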
@@ -5376,6 +5398,7 @@ struct smb_version_operations smb20_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
#endif /* CIFS_ALLOW_INSECURE_LEGACY */
@@ -5481,6 +5504,7 @@ struct smb_version_operations smb21_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
struct smb_version_operations smb30_operations = {
@@ -5597,6 +5621,7 @@ struct smb_version_operations smb30_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
struct smb_version_operations smb311_operations = {
@@ -5713,6 +5738,7 @@ struct smb_version_operations smb311_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 2df93a75e3b8..1c63d2c9cc9c 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -4411,7 +4411,7 @@ static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms)
return false;
/* offload also has its overhead, so only do it if desired */
- if (io_parms->length < server->smbd_conn->rdma_readwrite_threshold)
+ if (io_parms->length < server->rdma_readwrite_threshold)
return false;
return true;
@@ -6192,11 +6192,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
please_key_high = (__u64 *)(lease_key+8);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
- trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid,
+ trace_smb3_lease_ack_err(le32_to_cpu(lease_state), tcon->tid,
ses->Suid, *please_key_low, *please_key_high, rc);
cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
} else
- trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid,
+ trace_smb3_lease_ack_done(le32_to_cpu(lease_state), tcon->tid,
ses->Suid, *please_key_low, *please_key_high);
return rc;
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 6e805ece6a7b..b3f1398c9f79 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -317,5 +317,8 @@ int posix_info_sid_size(const void *beg, const void *end);
int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
+int smb2_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid);
#endif /* _SMB2PROTO_H */
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 02d6db431fd4..316f398c70f4 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -13,28 +13,35 @@
#include "cifsproto.h"
#include "smb2proto.h"
+const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn)
+{
+ struct smbdirect_socket *sc = &conn->socket;
+
+ return &sc->parameters;
+}
+
static struct smbdirect_recv_io *get_receive_buffer(
- struct smbd_connection *info);
+ struct smbdirect_socket *sc);
static void put_receive_buffer(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct smbdirect_recv_io *response);
-static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
-static void destroy_receive_buffers(struct smbd_connection *info);
+static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf);
+static void destroy_receive_buffers(struct smbdirect_socket *sc);
static void enqueue_reassembly(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct smbdirect_recv_io *response, int data_length);
static struct smbdirect_recv_io *_get_first_reassembly(
- struct smbd_connection *info);
+ struct smbdirect_socket *sc);
static int smbd_post_recv(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct smbdirect_recv_io *response);
-static int smbd_post_send_empty(struct smbd_connection *info);
+static int smbd_post_send_empty(struct smbdirect_socket *sc);
-static void destroy_mr_list(struct smbd_connection *info);
-static int allocate_mr_list(struct smbd_connection *info);
+static void destroy_mr_list(struct smbdirect_socket *sc);
+static int allocate_mr_list(struct smbdirect_socket *sc);
struct smb_extract_to_rdma {
struct ib_sge *sge;
@@ -57,6 +64,9 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
/* SMBD negotiation timeout in seconds */
#define SMBD_NEGOTIATE_TIMEOUT 120
+/* The timeout to wait for a keepalive message from peer in seconds */
+#define KEEPALIVE_RECV_TIMEOUT 5
+
/* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */
#define SMBD_MIN_RECEIVE_SIZE 128
#define SMBD_MIN_FRAGMENTED_SIZE 131072
@@ -155,65 +165,277 @@ do { \
#define log_rdma_mr(level, fmt, args...) \
log_rdma(level, LOG_RDMA_MR, fmt, ##args)
+static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc)
+{
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ wake_up_all(&sc->status_wait);
+ wake_up_all(&sc->send_io.credits.wait_queue);
+ wake_up_all(&sc->send_io.pending.dec_wait_queue);
+ wake_up_all(&sc->send_io.pending.zero_wait_queue);
+ wake_up_all(&sc->recv_io.reassembly.wait_queue);
+ wake_up_all(&sc->mr_io.ready.wait_queue);
+ wake_up_all(&sc->mr_io.cleanup.wait_queue);
+}
+
static void smbd_disconnect_rdma_work(struct work_struct *work)
{
- struct smbd_connection *info =
- container_of(work, struct smbd_connection, disconnect_work);
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, disconnect_work);
- if (sc->status == SMBDIRECT_SOCKET_CONNECTED) {
+ /*
+	 * Make sure this and the other work items are not queued again;
+	 * we must not block here, so avoid
+	 * disable[_delayed]_work_sync()
+ */
+ disable_work(&sc->disconnect_work);
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_work(&sc->mr_io.recovery_work);
+ disable_work(&sc->idle.immediate_work);
+ disable_delayed_work(&sc->idle.timer_work);
+
+ if (sc->first_error == 0)
+ sc->first_error = -ECONNABORTED;
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_CONNECTED:
+ case SMBDIRECT_SOCKET_ERROR:
sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
rdma_disconnect(sc->rdma.cm_id);
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ /*
+ * rdma_connect() never reached
+ * RDMA_CM_EVENT_ESTABLISHED
+ */
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ break;
+
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ break;
}
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ smbd_disconnect_wake_up_all(sc);
}
-static void smbd_disconnect_rdma_connection(struct smbd_connection *info)
+static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc)
{
- queue_work(info->workqueue, &info->disconnect_work);
+ /*
+	 * Make sure other work (than disconnect_work) is
+	 * not queued again; we must not block here, so avoid
+	 * disable[_delayed]_work_sync()
+ */
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_work(&sc->mr_io.recovery_work);
+ disable_work(&sc->idle.immediate_work);
+ disable_delayed_work(&sc->idle.timer_work);
+
+ if (sc->first_error == 0)
+ sc->first_error = -ECONNABORTED;
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_ERROR:
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ /*
+ * Keep the current error status
+ */
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_CONNECTED:
+ sc->status = SMBDIRECT_SOCKET_ERROR;
+ break;
+ }
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ smbd_disconnect_wake_up_all(sc);
+
+ queue_work(sc->workqueue, &sc->disconnect_work);
}
/* Upcall from RDMA CM */
static int smbd_conn_upcall(
struct rdma_cm_id *id, struct rdma_cm_event *event)
{
- struct smbd_connection *info = id->context;
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket *sc = id->context;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
const char *event_name = rdma_event_msg(event->event);
+ u8 peer_initiator_depth;
+ u8 peer_responder_resources;
log_rdma_event(INFO, "event=%s status=%d\n",
event_name, event->status);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED;
+ wake_up(&sc->status_wait);
+ break;
+
case RDMA_CM_EVENT_ROUTE_RESOLVED:
- info->ri_rc = 0;
- complete(&info->ri_done);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
+ wake_up(&sc->status_wait);
break;
case RDMA_CM_EVENT_ADDR_ERROR:
log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
- info->ri_rc = -EHOSTUNREACH;
- complete(&info->ri_done);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
+ smbd_disconnect_rdma_work(&sc->disconnect_work);
break;
case RDMA_CM_EVENT_ROUTE_ERROR:
log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
- info->ri_rc = -ENETUNREACH;
- complete(&info->ri_done);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
+ smbd_disconnect_rdma_work(&sc->disconnect_work);
break;
case RDMA_CM_EVENT_ESTABLISHED:
log_rdma_event(INFO, "connected event=%s\n", event_name);
- sc->status = SMBDIRECT_SOCKET_CONNECTED;
- wake_up_interruptible(&info->status_wait);
+
+ /*
+ * Here we work around an inconsistency between
+ * iWarp and other devices (at least rxe and irdma using RoCEv2)
+ */
+ if (rdma_protocol_iwarp(id->device, id->port_num)) {
+ /*
+ * iWarp devices report the peer's values
+ * with the perspective of the peer here.
+			 * Tested with siw and irdma (in iwarp mode).
+			 * We need to change to our perspective here,
+			 * so we switch the values.
+ */
+ peer_initiator_depth = event->param.conn.responder_resources;
+ peer_responder_resources = event->param.conn.initiator_depth;
+ } else {
+ /*
+			 * Non-iWarp devices report the peer's values
+ * already changed to our perspective here.
+ * Tested with rxe and irdma (in roce mode).
+ */
+ peer_initiator_depth = event->param.conn.initiator_depth;
+ peer_responder_resources = event->param.conn.responder_resources;
+ }
+ if (rdma_protocol_iwarp(id->device, id->port_num) &&
+ event->param.conn.private_data_len == 8) {
+ /*
+ * Legacy clients with only iWarp MPA v1 support
+ * need a private blob in order to negotiate
+ * the IRD/ORD values.
+ */
+ const __be32 *ird_ord_hdr = event->param.conn.private_data;
+ u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
+ u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
+
+ /*
+ * cifs.ko sends the legacy IRD/ORD negotiation
+			 * even if iWarp MPA v2 was used.
+ *
+ * Here we check that the values match and only
+ * mark the client as legacy if they don't match.
+ */
+ if ((u32)event->param.conn.initiator_depth != ird32 ||
+ (u32)event->param.conn.responder_resources != ord32) {
+ /*
+				 * There are broken clients (old cifs.ko)
+				 * that use little-endian values, and
+				 * struct rdma_conn_param only uses u8
+				 * for initiator_depth and responder_resources,
+				 * so we truncate the values to U8_MAX.
+ *
+ * smb_direct_accept_client() will then
+ * do the real negotiation in order to
+ * select the minimum between client and
+ * server.
+ */
+ ird32 = min_t(u32, ird32, U8_MAX);
+ ord32 = min_t(u32, ord32, U8_MAX);
+
+ sc->rdma.legacy_iwarp = true;
+ peer_initiator_depth = (u8)ird32;
+ peer_responder_resources = (u8)ord32;
+ }
+ }
+
+ /*
+		 * negotiate the values by using the minimum
+		 * between client and server if the client provided
+		 * non-zero values.
+ */
+ if (peer_initiator_depth != 0)
+ sp->initiator_depth =
+ min_t(u8, sp->initiator_depth,
+ peer_initiator_depth);
+ if (peer_responder_resources != 0)
+ sp->responder_resources =
+ min_t(u8, sp->responder_resources,
+ peer_responder_resources);
+
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
+ wake_up(&sc->status_wait);
break;
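		/*
		 * Aside (illustration, not part of this patch): the legacy
		 * iWarp MPA v1 private blob handled above is just two
		 * big-endian 32-bit words, IRD followed by ORD:
		 *
		 *	const __be32 *blob = event->param.conn.private_data;
		 *	u32 ird = be32_to_cpu(blob[0]);	(peer initiator depth)
		 *	u32 ord = be32_to_cpu(blob[1]);	(peer responder resources)
		 *
		 * Both values are truncated to U8_MAX before the min_t()
		 * negotiation, since struct rdma_conn_param only carries u8.
		 */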
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_REJECTED:
log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- wake_up_interruptible(&info->status_wait);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
+ smbd_disconnect_rdma_work(&sc->disconnect_work);
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
@@ -221,15 +443,10 @@ static int smbd_conn_upcall(
/* This happens when we fail the negotiation */
if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) {
log_rdma_event(ERR, "event=%s during negotiation\n", event_name);
- sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- wake_up(&info->status_wait);
- break;
}
sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
- wake_up_interruptible(&info->status_wait);
- wake_up_interruptible(&sc->recv_io.reassembly.wait_queue);
- wake_up_interruptible_all(&info->wait_send_queue);
+ smbd_disconnect_rdma_work(&sc->disconnect_work);
break;
default:
@@ -245,15 +462,15 @@ static int smbd_conn_upcall(
static void
smbd_qp_async_error_upcall(struct ib_event *event, void *context)
{
- struct smbd_connection *info = context;
+ struct smbdirect_socket *sc = context;
- log_rdma_event(ERR, "%s on device %s info %p\n",
- ib_event_msg(event->event), event->device->name, info);
+ log_rdma_event(ERR, "%s on device %s socket %p\n",
+ ib_event_msg(event->event), event->device->name, sc);
switch (event->event) {
case IB_EVENT_CQ_ERR:
case IB_EVENT_QP_FATAL:
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
break;
default:
@@ -278,11 +495,9 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbdirect_send_io *request =
container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
struct smbdirect_socket *sc = request->socket;
- struct smbd_connection *info =
- container_of(sc, struct smbd_connection, socket);
- log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%d\n",
- request, wc->status);
+ log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n",
+ request, ib_wc_status_msg(wc->status));
for (i = 0; i < request->num_sge; i++)
ib_dma_unmap_single(sc->ib.dev,
@@ -291,17 +506,18 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
DMA_TO_DEVICE);
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
- log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
- wc->status, wc->opcode);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->opcode);
mempool_free(request, sc->send_io.mem.pool);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
return;
}
- if (atomic_dec_and_test(&info->send_pending))
- wake_up(&info->wait_send_pending);
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
- wake_up(&info->wait_post_send);
+ wake_up(&sc->send_io.pending.dec_wait_queue);
mempool_free(request, sc->send_io.mem.pool);
}
@@ -325,8 +541,6 @@ static bool process_negotiation_response(
struct smbdirect_recv_io *response, int packet_length)
{
struct smbdirect_socket *sc = response->socket;
- struct smbd_connection *info =
- container_of(sc, struct smbd_connection, socket);
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response);
@@ -341,21 +555,19 @@ static bool process_negotiation_response(
le16_to_cpu(packet->negotiated_version));
return false;
}
- info->protocol = le16_to_cpu(packet->negotiated_version);
if (packet->credits_requested == 0) {
log_rdma_event(ERR, "error: credits_requested==0\n");
return false;
}
- info->receive_credit_target = le16_to_cpu(packet->credits_requested);
+ sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested);
+ sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
if (packet->credits_granted == 0) {
log_rdma_event(ERR, "error: credits_granted==0\n");
return false;
}
- atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted));
-
- atomic_set(&info->receive_credits, 0);
+ atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted));
if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
log_rdma_event(ERR, "error: preferred_send_size=%d\n",
@@ -380,16 +592,12 @@ static bool process_negotiation_response(
}
sp->max_fragmented_send_size =
le32_to_cpu(packet->max_fragmented_size);
- info->rdma_readwrite_threshold =
- rdma_readwrite_threshold > sp->max_fragmented_send_size ?
- sp->max_fragmented_send_size :
- rdma_readwrite_threshold;
sp->max_read_write_size = min_t(u32,
le32_to_cpu(packet->max_readwrite_size),
- info->max_frmr_depth * PAGE_SIZE);
- info->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
+ sp->max_frmr_depth * PAGE_SIZE);
+ sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
return true;
@@ -397,52 +605,40 @@ static bool process_negotiation_response(
static void smbd_post_send_credits(struct work_struct *work)
{
- int ret = 0;
int rc;
struct smbdirect_recv_io *response;
- struct smbd_connection *info =
- container_of(work, struct smbd_connection,
- post_send_credits_work);
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
- wake_up(&info->wait_receive_queues);
return;
}
- if (info->receive_credit_target >
- atomic_read(&info->receive_credits)) {
+ if (sc->recv_io.credits.target >
+ atomic_read(&sc->recv_io.credits.count)) {
while (true) {
- response = get_receive_buffer(info);
+ response = get_receive_buffer(sc);
if (!response)
break;
response->first_segment = false;
- rc = smbd_post_recv(info, response);
+ rc = smbd_post_recv(sc, response);
if (rc) {
log_rdma_recv(ERR,
"post_recv failed rc=%d\n", rc);
- put_receive_buffer(info, response);
+ put_receive_buffer(sc, response);
break;
}
- ret++;
+ atomic_inc(&sc->recv_io.posted.count);
}
}
- spin_lock(&info->lock_new_credits_offered);
- info->new_credits_offered += ret;
- spin_unlock(&info->lock_new_credits_offered);
-
/* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
- info->send_immediate = true;
- if (atomic_read(&info->receive_credits) <
- info->receive_credit_target - 1) {
- if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
- info->send_immediate) {
- log_keep_alive(INFO, "send an empty message\n");
- smbd_post_send_empty(info);
- }
+ if (atomic_read(&sc->recv_io.credits.count) <
+ sc->recv_io.credits.target - 1) {
+ log_keep_alive(INFO, "schedule send of an empty message\n");
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
}
}
@@ -453,17 +649,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbdirect_recv_io *response =
container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
struct smbdirect_socket *sc = response->socket;
- struct smbd_connection *info =
- container_of(sc, struct smbd_connection, socket);
- int data_length = 0;
-
- log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n",
- response, sc->recv_io.expected, wc->status, wc->opcode,
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ u16 old_recv_credit_target;
+ u32 data_offset = 0;
+ u32 data_length = 0;
+ u32 remaining_data_length = 0;
+ bool negotiate_done = false;
+
+ log_rdma_recv(INFO,
+ "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n",
+ response, sc->recv_io.expected,
+ ib_wc_status_msg(wc->status), wc->opcode,
wc->byte_len, wc->pkey_index);
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
- log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
- wc->status, wc->opcode);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ log_rdma_recv(ERR, "wc->status=%s opcode=%d\n",
+ ib_wc_status_msg(wc->status), wc->opcode);
goto error;
}
@@ -473,21 +675,52 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
response->sge.length,
DMA_FROM_DEVICE);
+ /*
+ * Reset timer to the keepalive interval in
+ * order to trigger our next keepalive message.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_interval_msec));
+
switch (sc->recv_io.expected) {
/* SMBD negotiation response */
case SMBDIRECT_EXPECT_NEGOTIATE_REP:
dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response));
sc->recv_io.reassembly.full_packet_received = true;
- info->negotiate_done =
+ negotiate_done =
process_negotiation_response(response, wc->byte_len);
- put_receive_buffer(info, response);
- complete(&info->negotiate_completion);
+ put_receive_buffer(sc, response);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING);
+ if (!negotiate_done) {
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
+ smbd_disconnect_rdma_connection(sc);
+ } else {
+ sc->status = SMBDIRECT_SOCKET_CONNECTED;
+ wake_up(&sc->status_wait);
+ }
+
return;
/* SMBD data transfer packet */
case SMBDIRECT_EXPECT_DATA_TRANSFER:
data_transfer = smbdirect_recv_io_payload(response);
+
+ if (wc->byte_len <
+ offsetof(struct smbdirect_data_transfer, padding))
+ goto error;
+
+ remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
+ data_offset = le32_to_cpu(data_transfer->data_offset);
data_length = le32_to_cpu(data_transfer->data_length);
+ if (wc->byte_len < data_offset ||
+ (u64)wc->byte_len < (u64)data_offset + data_length)
+ goto error;
+
+ if (remaining_data_length > sp->max_fragmented_recv_size ||
+ data_length > sp->max_fragmented_recv_size ||
+ (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)
+ goto error;
if (data_length) {
if (sc->recv_io.reassembly.full_packet_received)
@@ -499,17 +732,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
sc->recv_io.reassembly.full_packet_received = true;
}
- atomic_dec(&info->receive_credits);
- info->receive_credit_target =
+ atomic_dec(&sc->recv_io.posted.count);
+ atomic_dec(&sc->recv_io.credits.count);
+ old_recv_credit_target = sc->recv_io.credits.target;
+ sc->recv_io.credits.target =
le16_to_cpu(data_transfer->credits_requested);
+ sc->recv_io.credits.target =
+ min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
+ sc->recv_io.credits.target =
+ max_t(u16, sc->recv_io.credits.target, 1);
if (le16_to_cpu(data_transfer->credits_granted)) {
atomic_add(le16_to_cpu(data_transfer->credits_granted),
- &info->send_credits);
+ &sc->send_io.credits.count);
/*
* We have new send credits granted from remote peer
* If any sender is waiting for credits, unblock it
*/
- wake_up_interruptible(&info->wait_send_queue);
+ wake_up(&sc->send_io.credits.wait_queue);
}
log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n",
@@ -518,11 +757,11 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
le32_to_cpu(data_transfer->data_length),
le32_to_cpu(data_transfer->remaining_data_length));
- /* Send a KEEP_ALIVE response right away if requested */
- info->keep_alive_requested = KEEP_ALIVE_NONE;
+ /* Send an immediate response right away if requested */
if (le16_to_cpu(data_transfer->flags) &
SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
- info->keep_alive_requested = KEEP_ALIVE_PENDING;
+ log_keep_alive(INFO, "schedule send of immediate response\n");
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
}
/*
@@ -530,10 +769,13 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
* reassembly queue and wake up the reading thread
*/
if (data_length) {
- enqueue_reassembly(info, response, data_length);
- wake_up_interruptible(&sc->recv_io.reassembly.wait_queue);
+ if (sc->recv_io.credits.target > old_recv_credit_target)
+ queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
+
+ enqueue_reassembly(sc, response, data_length);
+ wake_up(&sc->recv_io.reassembly.wait_queue);
} else
- put_receive_buffer(info, response);
+ put_receive_buffer(sc, response);
return;
@@ -548,19 +790,20 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected);
WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
error:
- put_receive_buffer(info, response);
- smbd_disconnect_rdma_connection(info);
+ put_receive_buffer(sc, response);
+ smbd_disconnect_rdma_connection(sc);
}
static struct rdma_cm_id *smbd_create_id(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct sockaddr *dstaddr, int port)
{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct rdma_cm_id *id;
int rc;
__be16 *sport;
- id = rdma_create_id(&init_net, smbd_conn_upcall, info,
+ id = rdma_create_id(&init_net, smbd_conn_upcall, sc,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(id)) {
rc = PTR_ERR(id);
@@ -575,43 +818,57 @@ static struct rdma_cm_id *smbd_create_id(
*sport = htons(port);
- init_completion(&info->ri_done);
- info->ri_rc = -ETIMEDOUT;
-
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING;
rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
- RDMA_RESOLVE_TIMEOUT);
+ sp->resolve_addr_timeout_msec);
if (rc) {
log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
goto out;
}
- rc = wait_for_completion_interruptible_timeout(
- &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ rc = wait_event_interruptible_timeout(
+ sc->status_wait,
+ sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
+ msecs_to_jiffies(sp->resolve_addr_timeout_msec));
/* e.g. if interrupted returns -ERESTARTSYS */
if (rc < 0) {
log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
goto out;
}
- rc = info->ri_rc;
- if (rc) {
+ if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) {
+ rc = -ETIMEDOUT;
+ log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
+ goto out;
+ }
+ if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) {
+ rc = -EHOSTUNREACH;
log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
goto out;
}
- info->ri_rc = -ETIMEDOUT;
- rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING;
+ rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec);
if (rc) {
log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
goto out;
}
- rc = wait_for_completion_interruptible_timeout(
- &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ rc = wait_event_interruptible_timeout(
+ sc->status_wait,
+ sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING,
+ msecs_to_jiffies(sp->resolve_route_timeout_msec));
/* e.g. if interrupted returns -ERESTARTSYS */
if (rc < 0) {
		log_rdma_event(ERR, "rdma_resolve_route timeout rc: %i\n", rc);
goto out;
}
- rc = info->ri_rc;
- if (rc) {
+ if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) {
+ rc = -ETIMEDOUT;
+ log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
+ goto out;
+ }
+ if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) {
+ rc = -ENETUNREACH;
log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
goto out;
}
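Both resolve steps now follow the same pattern instead of a completion plus info->ri_rc: start the asynchronous call, wait for the CM upcall to move sc->status, then decode the outcome from the wait return value and the resulting state. A rough sketch of that decoding, with simplified stand-ins for the SMBDIRECT_SOCKET_* states:

/* Sketch: interpreting (wait_rc, state) after waiting for a transition. */
#include <errno.h>

enum resolve_state {
	RESOLVE_RUNNING,	/* still waiting for the CM upcall */
	RESOLVE_NEXT_STEP,	/* upcall reported success */
	RESOLVE_FAILED,		/* upcall reported an error */
};

static int decode_resolve_wait(long wait_rc, enum resolve_state state,
			       int err_if_failed)
{
	if (wait_rc < 0)
		return (int)wait_rc;	/* interrupted, e.g. -ERESTARTSYS */
	if (state == RESOLVE_RUNNING)
		return -ETIMEDOUT;	/* no upcall before the timeout */
	if (state != RESOLVE_NEXT_STEP)
		return err_if_failed;	/* -EHOSTUNREACH or -ENETUNREACH */
	return 0;			/* advance to the next step */
}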
@@ -638,13 +895,16 @@ static bool frwr_is_supported(struct ib_device_attr *attrs)
}
static int smbd_ia_open(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct sockaddr *dstaddr, int port)
{
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
int rc;
- sc->rdma.cm_id = smbd_create_id(info, dstaddr, port);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED;
+
+ sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port);
if (IS_ERR(sc->rdma.cm_id)) {
rc = PTR_ERR(sc->rdma.cm_id);
goto out1;
@@ -659,19 +919,12 @@ static int smbd_ia_open(
rc = -EPROTONOSUPPORT;
goto out2;
}
- info->max_frmr_depth = min_t(int,
- smbd_max_frmr_depth,
+ sp->max_frmr_depth = min_t(u32,
+ sp->max_frmr_depth,
sc->ib.dev->attrs.max_fast_reg_page_list_len);
- info->mr_type = IB_MR_TYPE_MEM_REG;
+ sc->mr_io.type = IB_MR_TYPE_MEM_REG;
if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
- info->mr_type = IB_MR_TYPE_SG_GAPS;
-
- sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
- if (IS_ERR(sc->ib.pd)) {
- rc = PTR_ERR(sc->ib.pd);
- log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
- goto out2;
- }
+ sc->mr_io.type = IB_MR_TYPE_SG_GAPS;
return 0;
@@ -689,9 +942,8 @@ out1:
* After negotiation, the transport is connected and ready for
* carrying upper layer SMB payload
*/
-static int smbd_post_send_negotiate_req(struct smbd_connection *info)
+static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_send_wr send_wr;
int rc = -ENOMEM;
@@ -743,18 +995,18 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
request->sge[0].addr,
request->sge[0].length, request->sge[0].lkey);
- atomic_inc(&info->send_pending);
+ atomic_inc(&sc->send_io.pending.count);
rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
if (!rc)
return 0;
/* if we reach here, post send failed */
log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
- atomic_dec(&info->send_pending);
+ atomic_dec(&sc->send_io.pending.count);
ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr,
request->sge[0].length, DMA_TO_DEVICE);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
dma_mapping_failed:
mempool_free(request, sc->send_io.mem.pool);
@@ -769,14 +1021,20 @@ dma_mapping_failed:
* buffer as possible, and extend the receive credits to remote peer
 * return value: the new credits being granted.
*/
-static int manage_credits_prior_sending(struct smbd_connection *info)
+static int manage_credits_prior_sending(struct smbdirect_socket *sc)
{
int new_credits;
- spin_lock(&info->lock_new_credits_offered);
- new_credits = info->new_credits_offered;
- info->new_credits_offered = 0;
- spin_unlock(&info->lock_new_credits_offered);
+ if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
+ return 0;
+
+ new_credits = atomic_read(&sc->recv_io.posted.count);
+ if (new_credits == 0)
+ return 0;
+
+ new_credits -= atomic_read(&sc->recv_io.credits.count);
+ if (new_credits <= 0)
+ return 0;
return new_credits;
}
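manage_credits_prior_sending() no longer drains a new_credits_offered counter; it derives the grant from the two atomics directly: nothing is granted while the peer still holds at least the target number of credits, otherwise the peer is topped up with every posted receive it has not yet been told about. As a rough model (for example, posted = 10 and granted = 3 yields a grant of 7):

/* Sketch: receive credits to grant with the next outgoing packet. */
static int credits_to_grant(int posted, int granted, int target)
{
	if (granted >= target)		/* peer already has enough credits */
		return 0;
	if (posted == 0)		/* no receives posted, nothing to offer */
		return 0;
	if (posted - granted <= 0)	/* everything posted is already granted */
		return 0;
	return posted - granted;	/* advertise the not-yet-granted receives */
}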
@@ -790,21 +1048,27 @@ static int manage_credits_prior_sending(struct smbd_connection *info)
* 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set
* 0: otherwise
*/
-static int manage_keep_alive_before_sending(struct smbd_connection *info)
+static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
{
- if (info->keep_alive_requested == KEEP_ALIVE_PENDING) {
- info->keep_alive_requested = KEEP_ALIVE_SENT;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+
+ if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
+ /*
+ * Now use the keepalive timeout (instead of keepalive interval)
+ * in order to wait for a response
+ */
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_timeout_msec));
return 1;
}
return 0;
}
/* Post the send request */
-static int smbd_post_send(struct smbd_connection *info,
+static int smbd_post_send(struct smbdirect_socket *sc,
struct smbdirect_send_io *request)
{
- struct smbdirect_socket *sc = &info->socket;
- struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_send_wr send_wr;
int rc, i;
@@ -831,21 +1095,17 @@ static int smbd_post_send(struct smbd_connection *info,
rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
if (rc) {
log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
rc = -EAGAIN;
- } else
- /* Reset timer for idle connection after packet is sent */
- mod_delayed_work(info->workqueue, &info->idle_timer_work,
- msecs_to_jiffies(sp->keepalive_interval_msec));
+ }
return rc;
}
-static int smbd_post_send_iter(struct smbd_connection *info,
+static int smbd_post_send_iter(struct smbdirect_socket *sc,
struct iov_iter *iter,
int *_remaining_data_length)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_socket_parameters *sp = &sc->parameters;
int i, rc;
int header_length;
@@ -856,8 +1116,8 @@ static int smbd_post_send_iter(struct smbd_connection *info,
wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
- rc = wait_event_interruptible(info->wait_send_queue,
- atomic_read(&info->send_credits) > 0 ||
+ rc = wait_event_interruptible(sc->send_io.credits.wait_queue,
+ atomic_read(&sc->send_io.credits.count) > 0 ||
sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (rc)
goto err_wait_credit;
@@ -867,14 +1127,14 @@ wait_credit:
rc = -EAGAIN;
goto err_wait_credit;
}
- if (unlikely(atomic_dec_return(&info->send_credits) < 0)) {
- atomic_inc(&info->send_credits);
+ if (unlikely(atomic_dec_return(&sc->send_io.credits.count) < 0)) {
+ atomic_inc(&sc->send_io.credits.count);
goto wait_credit;
}
wait_send_queue:
- wait_event(info->wait_post_send,
- atomic_read(&info->send_pending) < sp->send_credit_target ||
+ wait_event(sc->send_io.pending.dec_wait_queue,
+ atomic_read(&sc->send_io.pending.count) < sp->send_credit_target ||
sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
@@ -883,9 +1143,9 @@ wait_send_queue:
goto err_wait_send_queue;
}
- if (unlikely(atomic_inc_return(&info->send_pending) >
+ if (unlikely(atomic_inc_return(&sc->send_io.pending.count) >
sp->send_credit_target)) {
- atomic_dec(&info->send_pending);
+ atomic_dec(&sc->send_io.pending.count);
goto wait_send_queue;
}
@@ -898,10 +1158,30 @@ wait_send_queue:
request->socket = sc;
memset(request->sge, 0, sizeof(request->sge));
+ /* Map the packet to DMA */
+ header_length = sizeof(struct smbdirect_data_transfer);
+ /* If this is a packet without payload, don't send padding */
+ if (!iter)
+ header_length = offsetof(struct smbdirect_data_transfer, padding);
+
+ packet = smbdirect_send_io_payload(request);
+ request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
+ (void *)packet,
+ header_length,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
+ rc = -EIO;
+ goto err_dma;
+ }
+
+ request->sge[0].length = header_length;
+ request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
+ request->num_sge = 1;
+
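Mapping the header before the payload is what lets the extraction below start at nr_sge == request->num_sge: sge[0] always carries the smbdirect_data_transfer header and any payload fragments are appended after it. A rough sketch of the resulting scatter/gather layout; struct sge here is a stand-in for struct ib_sge and the helper is purely illustrative:

/* Sketch: header first, then payload fragments, capped at max_sge. */
#include <stdint.h>

struct sge { uint64_t addr; uint32_t length; uint32_t lkey; };

static int build_send_sges(struct sge *sge, int max_sge, uint32_t lkey,
			   uint64_t hdr_addr, uint32_t hdr_len,
			   const struct sge *frags, int nfrags)
{
	int n = 0;

	sge[n].addr = hdr_addr;		/* sge[0]: the SMBD header */
	sge[n].length = hdr_len;
	sge[n].lkey = lkey;
	n++;				/* payload extraction appends here */

	for (int i = 0; i < nfrags && n < max_sge; i++, n++)
		sge[n] = frags[i];	/* sge[1..]: payload fragments */

	return n;			/* becomes request->num_sge */
}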
/* Fill in the data payload to find out how much data we can add */
if (iter) {
struct smb_extract_to_rdma extract = {
- .nr_sge = 1,
+ .nr_sge = request->num_sge,
.max_sge = SMBDIRECT_SEND_IO_MAX_SGE,
.sge = request->sge,
.device = sc->ib.dev,
@@ -920,21 +1200,17 @@ wait_send_queue:
*_remaining_data_length -= data_length;
} else {
data_length = 0;
- request->num_sge = 1;
}
/* Fill in the packet header */
- packet = smbdirect_send_io_payload(request);
packet->credits_requested = cpu_to_le16(sp->send_credit_target);
- new_credits = manage_credits_prior_sending(info);
- atomic_add(new_credits, &info->receive_credits);
+ new_credits = manage_credits_prior_sending(sc);
+ atomic_add(new_credits, &sc->recv_io.credits.count);
packet->credits_granted = cpu_to_le16(new_credits);
- info->send_immediate = false;
-
packet->flags = 0;
- if (manage_keep_alive_before_sending(info))
+ if (manage_keep_alive_before_sending(sc))
packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
packet->reserved = 0;
@@ -953,26 +1229,7 @@ wait_send_queue:
le32_to_cpu(packet->data_length),
le32_to_cpu(packet->remaining_data_length));
- /* Map the packet to DMA */
- header_length = sizeof(struct smbdirect_data_transfer);
- /* If this is a packet without payload, don't send padding */
- if (!data_length)
- header_length = offsetof(struct smbdirect_data_transfer, padding);
-
- request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
- (void *)packet,
- header_length,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
- rc = -EIO;
- request->sge[0].addr = 0;
- goto err_dma;
- }
-
- request->sge[0].length = header_length;
- request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
-
- rc = smbd_post_send(info, request);
+ rc = smbd_post_send(sc, request);
if (!rc)
return 0;
@@ -985,19 +1242,16 @@ err_dma:
DMA_TO_DEVICE);
mempool_free(request, sc->send_io.mem.pool);
- /* roll back receive credits and credits to be offered */
- spin_lock(&info->lock_new_credits_offered);
- info->new_credits_offered += new_credits;
- spin_unlock(&info->lock_new_credits_offered);
- atomic_sub(new_credits, &info->receive_credits);
+ /* roll back the granted receive credits */
+ atomic_sub(new_credits, &sc->recv_io.credits.count);
err_alloc:
- if (atomic_dec_and_test(&info->send_pending))
- wake_up(&info->wait_send_pending);
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
err_wait_send_queue:
/* roll back send credits and pending */
- atomic_inc(&info->send_credits);
+ atomic_inc(&sc->send_io.credits.count);
err_wait_credit:
return rc;
@@ -1008,15 +1262,15 @@ err_wait_credit:
 * An empty message is used to extend credits to the peer for keep-alive
* while there is no upper layer payload to send at the time
*/
-static int smbd_post_send_empty(struct smbd_connection *info)
+static int smbd_post_send_empty(struct smbdirect_socket *sc)
{
int remaining_data_length = 0;
- info->count_send_empty++;
- return smbd_post_send_iter(info, NULL, &remaining_data_length);
+ sc->statistics.send_empty++;
+ return smbd_post_send_iter(sc, NULL, &remaining_data_length);
}
-static int smbd_post_send_full_iter(struct smbd_connection *info,
+static int smbd_post_send_full_iter(struct smbdirect_socket *sc,
struct iov_iter *iter,
int *_remaining_data_length)
{
@@ -1029,7 +1283,7 @@ static int smbd_post_send_full_iter(struct smbd_connection *info,
*/
while (iov_iter_count(iter) > 0) {
- rc = smbd_post_send_iter(info, iter, _remaining_data_length);
+ rc = smbd_post_send_iter(sc, iter, _remaining_data_length);
if (rc < 0)
break;
}
@@ -1043,9 +1297,8 @@ static int smbd_post_send_full_iter(struct smbd_connection *info,
* The interaction is controlled by send/receive credit system
*/
static int smbd_post_recv(
- struct smbd_connection *info, struct smbdirect_recv_io *response)
+ struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_recv_wr recv_wr;
int rc = -EIO;
@@ -1071,7 +1324,7 @@ static int smbd_post_recv(
ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
response->sge.length, DMA_FROM_DEVICE);
response->sge.length = 0;
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
}
@@ -1079,31 +1332,36 @@ static int smbd_post_recv(
}
/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
-static int smbd_negotiate(struct smbd_connection *info)
+static int smbd_negotiate(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
int rc;
- struct smbdirect_recv_io *response = get_receive_buffer(info);
+ struct smbdirect_recv_io *response = get_receive_buffer(sc);
+
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP;
- rc = smbd_post_recv(info, response);
+ rc = smbd_post_recv(sc, response);
log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
rc, response->sge.addr,
response->sge.length, response->sge.lkey);
- if (rc)
+ if (rc) {
+ put_receive_buffer(sc, response);
return rc;
+ }
- init_completion(&info->negotiate_completion);
- info->negotiate_done = false;
- rc = smbd_post_send_negotiate_req(info);
+ rc = smbd_post_send_negotiate_req(sc);
if (rc)
return rc;
- rc = wait_for_completion_interruptible_timeout(
- &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ);
- log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc);
+ rc = wait_event_interruptible_timeout(
+ sc->status_wait,
+ sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
+ log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc);
- if (info->negotiate_done)
+ if (sc->status == SMBDIRECT_SOCKET_CONNECTED)
return 0;
if (rc == 0)
@@ -1127,13 +1385,13 @@ static int smbd_negotiate(struct smbd_connection *info)
* data_length: the size of payload in this packet
*/
static void enqueue_reassembly(
- struct smbd_connection *info,
+ struct smbdirect_socket *sc,
struct smbdirect_recv_io *response,
int data_length)
{
- struct smbdirect_socket *sc = &info->socket;
+ unsigned long flags;
- spin_lock(&sc->recv_io.reassembly.lock);
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
list_add_tail(&response->list, &sc->recv_io.reassembly.list);
sc->recv_io.reassembly.queue_length++;
/*
@@ -1144,9 +1402,8 @@ static void enqueue_reassembly(
*/
virt_wmb();
sc->recv_io.reassembly.data_length += data_length;
- spin_unlock(&sc->recv_io.reassembly.lock);
- info->count_reassembly_queue++;
- info->count_enqueue_reassembly_queue++;
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+ sc->statistics.enqueue_reassembly_queue++;
}
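The irqsave variant is the visible change, but the ordering comment above still carries the important invariant: the entry must be on the list before data_length advertises it, because smbd_recv() checks data_length without taking the reassembly lock. A small C11 model of that publish/consume pairing (the kernel code uses virt_wmb() and a spinlock rather than C11 atomics):

/* Sketch: release/acquire pairing behind the reassembly queue. */
#include <stdatomic.h>
#include <stddef.h>

struct entry { struct entry *next; size_t len; };

struct reassembly {
	struct entry *head;		/* updated under the queue lock */
	_Atomic size_t data_length;	/* read locklessly by the receiver */
};

static void publish(struct reassembly *q, struct entry *e)
{
	e->next = q->head;		/* list insert (lock elided here) */
	q->head = e;
	/* release: the list update is visible before the new length */
	atomic_fetch_add_explicit(&q->data_length, e->len,
				  memory_order_release);
}

static size_t bytes_available(struct reassembly *q)
{
	/* acquire pairs with the release above: seeing the length
	 * guarantees we also see the entry that contributed it */
	return atomic_load_explicit(&q->data_length, memory_order_acquire);
}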
/*
@@ -1154,9 +1411,8 @@ static void enqueue_reassembly(
* Caller is responsible for locking
* return value: the first entry if any, NULL if queue is empty
*/
-static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *info)
+static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_recv_io *ret = NULL;
if (!list_empty(&sc->recv_io.reassembly.list)) {
@@ -1173,9 +1429,8 @@ static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *i
* pre-allocated in advance.
* return value: the receive buffer, NULL if none is available
*/
-static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info)
+static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_recv_io *ret = NULL;
unsigned long flags;
@@ -1185,8 +1440,7 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info
&sc->recv_io.free.list,
struct smbdirect_recv_io, list);
list_del(&ret->list);
- info->count_receive_queue--;
- info->count_get_receive_buffer++;
+ sc->statistics.get_receive_buffer++;
}
spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
@@ -1200,9 +1454,8 @@ static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info
* receive buffer is returned.
*/
static void put_receive_buffer(
- struct smbd_connection *info, struct smbdirect_recv_io *response)
+ struct smbdirect_socket *sc, struct smbdirect_recv_io *response)
{
- struct smbdirect_socket *sc = &info->socket;
unsigned long flags;
if (likely(response->sge.length != 0)) {
@@ -1215,31 +1468,18 @@ static void put_receive_buffer(
spin_lock_irqsave(&sc->recv_io.free.lock, flags);
list_add_tail(&response->list, &sc->recv_io.free.list);
- info->count_receive_queue++;
- info->count_put_receive_buffer++;
+ sc->statistics.put_receive_buffer++;
spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
- queue_work(info->workqueue, &info->post_send_credits_work);
+ queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
}
/* Preallocate all receive buffers on transport establishment */
-static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
+static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_recv_io *response;
int i;
- INIT_LIST_HEAD(&sc->recv_io.reassembly.list);
- spin_lock_init(&sc->recv_io.reassembly.lock);
- sc->recv_io.reassembly.data_length = 0;
- sc->recv_io.reassembly.queue_length = 0;
-
- INIT_LIST_HEAD(&sc->recv_io.free.list);
- spin_lock_init(&sc->recv_io.free.lock);
- info->count_receive_queue = 0;
-
- init_waitqueue_head(&info->wait_receive_queues);
-
for (i = 0; i < num_buf; i++) {
response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL);
if (!response)
@@ -1248,7 +1488,6 @@ static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
response->socket = sc;
response->sge.length = 0;
list_add_tail(&response->list, &sc->recv_io.free.list);
- info->count_receive_queue++;
}
return 0;
@@ -1259,45 +1498,59 @@ allocate_failed:
&sc->recv_io.free.list,
struct smbdirect_recv_io, list);
list_del(&response->list);
- info->count_receive_queue--;
mempool_free(response, sc->recv_io.mem.pool);
}
return -ENOMEM;
}
-static void destroy_receive_buffers(struct smbd_connection *info)
+static void destroy_receive_buffers(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_recv_io *response;
- while ((response = get_receive_buffer(info)))
+ while ((response = get_receive_buffer(sc)))
mempool_free(response, sc->recv_io.mem.pool);
}
+static void send_immediate_empty_message(struct work_struct *work)
+{
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, idle.immediate_work);
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return;
+
+ log_keep_alive(INFO, "send an empty message\n");
+ smbd_post_send_empty(sc);
+}
+
/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
{
- struct smbd_connection *info = container_of(
- work, struct smbd_connection,
- idle_timer_work.work);
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, idle.timer_work.work);
struct smbdirect_socket_parameters *sp = &sc->parameters;
- if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
+ if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
log_keep_alive(ERR,
- "error status info->keep_alive_requested=%d\n",
- info->keep_alive_requested);
- smbd_disconnect_rdma_connection(info);
+ "error status sc->idle.keepalive=%d\n",
+ sc->idle.keepalive);
+ smbd_disconnect_rdma_connection(sc);
return;
}
- log_keep_alive(INFO, "about to send an empty idle message\n");
- smbd_post_send_empty(info);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return;
- /* Setup the next idle timeout work */
- queue_delayed_work(info->workqueue, &info->idle_timer_work,
- msecs_to_jiffies(sp->keepalive_interval_msec));
+ /*
+ * Now use the keepalive timeout (instead of keepalive interval)
+ * in order to wait for a response
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_timeout_msec));
+ log_keep_alive(INFO, "schedule send of empty idle message\n");
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
}
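Taken together with manage_keep_alive_before_sending() and send_immediate_empty_message(), the idle timer now drives a small keepalive state machine: an idle interval arms a probe, the next outgoing packet (or a scheduled empty one) carries SMBDIRECT_FLAG_RESPONSE_REQUESTED, and if the state is still armed when the shorter keepalive timeout fires, the connection is torn down. A rough model; the reset back to the idle state when the peer answers happens outside the hunks shown here:

/* Sketch: simplified keepalive cycle; states stand in for SMBDIRECT_KEEPALIVE_*. */
enum keepalive { KA_NONE, KA_PENDING, KA_SENT };

/* timer fires after keepalive_interval_msec of idle time */
static enum keepalive on_idle_timer(enum keepalive s, int *disconnect)
{
	if (s != KA_NONE) {	/* an earlier probe never got a reply */
		*disconnect = 1;
		return s;
	}
	/* re-arm with keepalive_timeout_msec and queue an empty message */
	return KA_PENDING;
}

/* called while building the next outgoing packet */
static enum keepalive on_send(enum keepalive s, int *request_response)
{
	if (s == KA_PENDING) {
		*request_response = 1;	/* SMBDIRECT_FLAG_RESPONSE_REQUESTED */
		return KA_SENT;
	}
	return s;
}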
/*
@@ -1309,7 +1562,6 @@ void smbd_destroy(struct TCP_Server_Info *server)
{
struct smbd_connection *info = server->smbd_conn;
struct smbdirect_socket *sc;
- struct smbdirect_socket_parameters *sp;
struct smbdirect_recv_io *response;
unsigned long flags;
@@ -1318,35 +1570,51 @@ void smbd_destroy(struct TCP_Server_Info *server)
return;
}
sc = &info->socket;
- sp = &sc->parameters;
+
+	log_rdma_event(INFO, "cancelling and disabling disconnect_work\n");
+ disable_work_sync(&sc->disconnect_work);
log_rdma_event(INFO, "destroying rdma session\n");
- if (sc->status != SMBDIRECT_SOCKET_DISCONNECTED) {
- rdma_disconnect(sc->rdma.cm_id);
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) {
+ smbd_disconnect_rdma_work(&sc->disconnect_work);
log_rdma_event(INFO, "wait for transport being disconnected\n");
wait_event_interruptible(
- info->status_wait,
+ sc->status_wait,
sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
}
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ *
+ * Most likely this was already called via
+ * smbd_disconnect_rdma_work(), but call it again...
+ */
+ smbd_disconnect_wake_up_all(sc);
+
+ log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n");
+ disable_work_sync(&sc->recv_io.posted.refill_work);
+
log_rdma_event(INFO, "destroying qp\n");
ib_drain_qp(sc->ib.qp);
rdma_destroy_qp(sc->rdma.cm_id);
sc->ib.qp = NULL;
log_rdma_event(INFO, "cancelling idle timer\n");
- cancel_delayed_work_sync(&info->idle_timer_work);
+ disable_delayed_work_sync(&sc->idle.timer_work);
+ log_rdma_event(INFO, "cancelling send immediate work\n");
+ disable_work_sync(&sc->idle.immediate_work);
/* It's not possible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
do {
spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
- response = _get_first_reassembly(info);
+ response = _get_first_reassembly(sc);
if (response) {
list_del(&response->list);
spin_unlock_irqrestore(
&sc->recv_io.reassembly.lock, flags);
- put_receive_buffer(info, response);
+ put_receive_buffer(sc, response);
} else
spin_unlock_irqrestore(
&sc->recv_io.reassembly.lock, flags);
@@ -1354,9 +1622,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
sc->recv_io.reassembly.data_length = 0;
log_rdma_event(INFO, "free receive buffers\n");
- wait_event(info->wait_receive_queues,
- info->count_receive_queue == sp->recv_credit_max);
- destroy_receive_buffers(info);
+ destroy_receive_buffers(sc);
/*
* For performance reasons, memory registration and deregistration
@@ -1366,13 +1632,12 @@ void smbd_destroy(struct TCP_Server_Info *server)
* path when sending data, and then release memory registrations.
*/
log_rdma_event(INFO, "freeing mr list\n");
- wake_up_interruptible_all(&info->wait_mr);
- while (atomic_read(&info->mr_used_count)) {
+ while (atomic_read(&sc->mr_io.used.count)) {
cifs_server_unlock(server);
msleep(1000);
cifs_server_lock(server);
}
- destroy_mr_list(info);
+ destroy_mr_list(sc);
ib_free_cq(sc->ib.send_cq);
ib_free_cq(sc->ib.recv_cq);
@@ -1388,7 +1653,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
sc->status = SMBDIRECT_SOCKET_DESTROYED;
- destroy_workqueue(info->workqueue);
+ destroy_workqueue(sc->workqueue);
log_rdma_event(INFO, "rdma session destroyed\n");
kfree(info);
server->smbd_conn = NULL;
@@ -1430,12 +1695,9 @@ create_conn:
return -ENOENT;
}
-static void destroy_caches_and_workqueue(struct smbd_connection *info)
+static void destroy_caches(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
-
- destroy_receive_buffers(info);
- destroy_workqueue(info->workqueue);
+ destroy_receive_buffers(sc);
mempool_destroy(sc->recv_io.mem.pool);
kmem_cache_destroy(sc->recv_io.mem.cache);
mempool_destroy(sc->send_io.mem.pool);
@@ -1443,9 +1705,8 @@ static void destroy_caches_and_workqueue(struct smbd_connection *info)
}
#define MAX_NAME_LEN 80
-static int allocate_caches_and_workqueue(struct smbd_connection *info)
+static int allocate_caches(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
struct smbdirect_socket_parameters *sp = &sc->parameters;
char name[MAX_NAME_LEN];
int rc;
@@ -1453,7 +1714,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer)))
return -ENOMEM;
- scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc);
sc->send_io.mem.cache =
kmem_cache_create(
name,
@@ -1469,7 +1730,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!sc->send_io.mem.pool)
goto out1;
- scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", info);
+ scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc);
struct kmem_cache_args response_args = {
.align = __alignof__(struct smbdirect_recv_io),
@@ -1490,21 +1751,14 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!sc->recv_io.mem.pool)
goto out3;
- scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
- info->workqueue = create_workqueue(name);
- if (!info->workqueue)
- goto out4;
-
- rc = allocate_receive_buffers(info, sp->recv_credit_max);
+ rc = allocate_receive_buffers(sc, sp->recv_credit_max);
if (rc) {
log_rdma_event(ERR, "failed to allocate receive buffers\n");
- goto out5;
+ goto out4;
}
return 0;
-out5:
- destroy_workqueue(info->workqueue);
out4:
mempool_destroy(sc->recv_io.mem.pool);
out3:
@@ -1528,46 +1782,63 @@ static struct smbd_connection *_smbd_get_connection(
struct ib_qp_init_attr qp_attr;
struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
struct ib_port_immutable port_immutable;
- u32 ird_ord_hdr[2];
+ __be32 ird_ord_hdr[2];
+ char wq_name[80];
+ struct workqueue_struct *workqueue;
info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
if (!info)
return NULL;
sc = &info->socket;
+ scnprintf(wq_name, ARRAY_SIZE(wq_name), "smbd_%p", sc);
+ workqueue = create_workqueue(wq_name);
+ if (!workqueue)
+ goto create_wq_failed;
+ smbdirect_socket_init(sc);
+ sc->workqueue = workqueue;
sp = &sc->parameters;
- sc->status = SMBDIRECT_SOCKET_CONNECTING;
- rc = smbd_ia_open(info, dstaddr, port);
+ INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work);
+
+ sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT;
+ sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT;
+ sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT;
+ sp->negotiate_timeout_msec = SMBD_NEGOTIATE_TIMEOUT * 1000;
+ sp->initiator_depth = 1;
+ sp->responder_resources = SMBD_CM_RESPONDER_RESOURCES;
+ sp->recv_credit_max = smbd_receive_credit_max;
+ sp->send_credit_target = smbd_send_credit_target;
+ sp->max_send_size = smbd_max_send_size;
+ sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
+ sp->max_recv_size = smbd_max_receive_size;
+ sp->max_frmr_depth = smbd_max_frmr_depth;
+ sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
+ sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000;
+
+ rc = smbd_ia_open(sc, dstaddr, port);
if (rc) {
log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
goto create_id_failed;
}
- if (smbd_send_credit_target > sc->ib.dev->attrs.max_cqe ||
- smbd_send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
+ if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe ||
+ sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
- smbd_send_credit_target,
+ sp->send_credit_target,
sc->ib.dev->attrs.max_cqe,
sc->ib.dev->attrs.max_qp_wr);
goto config_failed;
}
- if (smbd_receive_credit_max > sc->ib.dev->attrs.max_cqe ||
- smbd_receive_credit_max > sc->ib.dev->attrs.max_qp_wr) {
+ if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe ||
+ sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) {
log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
- smbd_receive_credit_max,
+ sp->recv_credit_max,
sc->ib.dev->attrs.max_cqe,
sc->ib.dev->attrs.max_qp_wr);
goto config_failed;
}
- sp->recv_credit_max = smbd_receive_credit_max;
- sp->send_credit_target = smbd_send_credit_target;
- sp->max_send_size = smbd_max_send_size;
- sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
- sp->max_recv_size = smbd_max_receive_size;
- sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
-
if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE ||
sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
log_rdma_event(ERR,
@@ -1579,8 +1850,16 @@ static struct smbd_connection *_smbd_get_connection(
goto config_failed;
}
+ sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
+ if (IS_ERR(sc->ib.pd)) {
+ rc = PTR_ERR(sc->ib.pd);
+ sc->ib.pd = NULL;
+ log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
+ goto alloc_pd_failed;
+ }
+
sc->ib.send_cq =
- ib_alloc_cq_any(sc->ib.dev, info,
+ ib_alloc_cq_any(sc->ib.dev, sc,
sp->send_credit_target, IB_POLL_SOFTIRQ);
if (IS_ERR(sc->ib.send_cq)) {
sc->ib.send_cq = NULL;
@@ -1588,7 +1867,7 @@ static struct smbd_connection *_smbd_get_connection(
}
sc->ib.recv_cq =
- ib_alloc_cq_any(sc->ib.dev, info,
+ ib_alloc_cq_any(sc->ib.dev, sc,
sp->recv_credit_max, IB_POLL_SOFTIRQ);
if (IS_ERR(sc->ib.recv_cq)) {
sc->ib.recv_cq = NULL;
@@ -1597,7 +1876,7 @@ static struct smbd_connection *_smbd_get_connection(
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.event_handler = smbd_qp_async_error_upcall;
- qp_attr.qp_context = info;
+ qp_attr.qp_context = sc;
qp_attr.cap.max_send_wr = sp->send_credit_target;
qp_attr.cap.max_recv_wr = sp->recv_credit_max;
qp_attr.cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
@@ -1616,22 +1895,22 @@ static struct smbd_connection *_smbd_get_connection(
}
sc->ib.qp = sc->rdma.cm_id->qp;
- memset(&conn_param, 0, sizeof(conn_param));
- conn_param.initiator_depth = 0;
-
- conn_param.responder_resources =
- min(sc->ib.dev->attrs.max_qp_rd_atom,
- SMBD_CM_RESPONDER_RESOURCES);
- info->responder_resources = conn_param.responder_resources;
+ sp->responder_resources =
+ min_t(u8, sp->responder_resources,
+ sc->ib.dev->attrs.max_qp_rd_atom);
log_rdma_mr(INFO, "responder_resources=%d\n",
- info->responder_resources);
+ sp->responder_resources);
+
+ memset(&conn_param, 0, sizeof(conn_param));
+ conn_param.initiator_depth = sp->initiator_depth;
+ conn_param.responder_resources = sp->responder_resources;
/* Need to send IRD/ORD in private data for iWARP */
sc->ib.dev->ops.get_port_immutable(
sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable);
if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
- ird_ord_hdr[0] = info->responder_resources;
- ird_ord_hdr[1] = 1;
+ ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
+ ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
conn_param.private_data = ird_ord_hdr;
conn_param.private_data_len = sizeof(ird_ord_hdr);
} else {
@@ -1646,8 +1925,8 @@ static struct smbd_connection *_smbd_get_connection(
log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
&addr_in->sin_addr, port);
- init_waitqueue_head(&info->status_wait);
- init_waitqueue_head(&sc->recv_io.reassembly.wait_queue);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
rc = rdma_connect(sc->rdma.cm_id, &conn_param);
if (rc) {
log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
@@ -1655,45 +1934,42 @@ static struct smbd_connection *_smbd_get_connection(
}
wait_event_interruptible_timeout(
- info->status_wait,
- sc->status != SMBDIRECT_SOCKET_CONNECTING,
- msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ sc->status_wait,
+ sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING,
+ msecs_to_jiffies(sp->rdma_connect_timeout_msec));
- if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) {
log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
goto rdma_connect_failed;
}
log_rdma_event(INFO, "rdma_connect connected\n");
- rc = allocate_caches_and_workqueue(info);
+ rc = allocate_caches(sc);
if (rc) {
log_rdma_event(ERR, "cache allocation failed\n");
goto allocate_cache_failed;
}
- init_waitqueue_head(&info->wait_send_queue);
- INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
- queue_delayed_work(info->workqueue, &info->idle_timer_work,
- msecs_to_jiffies(sp->keepalive_interval_msec));
-
- init_waitqueue_head(&info->wait_send_pending);
- atomic_set(&info->send_pending, 0);
-
- init_waitqueue_head(&info->wait_post_send);
+ INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message);
+ INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer);
+ /*
+ * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
+ * so that the timer will cause a disconnect.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
- INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
- INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
- info->new_credits_offered = 0;
- spin_lock_init(&info->lock_new_credits_offered);
+ INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits);
- rc = smbd_negotiate(info);
+ rc = smbd_negotiate(sc);
if (rc) {
log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
goto negotiation_failed;
}
- rc = allocate_mr_list(info);
+ rc = allocate_mr_list(sc);
if (rc) {
log_rdma_mr(ERR, "memory registration allocation failed\n");
goto allocate_mr_failed;
@@ -1708,11 +1984,11 @@ allocate_mr_failed:
return NULL;
negotiation_failed:
- cancel_delayed_work_sync(&info->idle_timer_work);
- destroy_caches_and_workqueue(info);
+ disable_delayed_work_sync(&sc->idle.timer_work);
+ destroy_caches(sc);
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
rdma_disconnect(sc->rdma.cm_id);
- wait_event(info->status_wait,
+ wait_event(sc->status_wait,
sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
allocate_cache_failed:
@@ -1726,11 +2002,15 @@ alloc_cq_failed:
if (sc->ib.recv_cq)
ib_free_cq(sc->ib.recv_cq);
-config_failed:
ib_dealloc_pd(sc->ib.pd);
+
+alloc_pd_failed:
+config_failed:
rdma_destroy_id(sc->rdma.cm_id);
create_id_failed:
+ destroy_workqueue(sc->workqueue);
+create_wq_failed:
kfree(info);
return NULL;
}
@@ -1739,6 +2019,7 @@ struct smbd_connection *smbd_get_connection(
struct TCP_Server_Info *server, struct sockaddr *dstaddr)
{
struct smbd_connection *ret;
+ const struct smbdirect_socket_parameters *sp;
int port = SMBD_PORT;
try_again:
@@ -1749,6 +2030,16 @@ try_again:
port = SMB_PORT;
goto try_again;
}
+ if (!ret)
+ return NULL;
+
+ sp = &ret->socket.parameters;
+
+ server->rdma_readwrite_threshold =
+ rdma_readwrite_threshold > sp->max_fragmented_send_size ?
+ sp->max_fragmented_send_size :
+ rdma_readwrite_threshold;
+
return ret;
}
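New in smbd_get_connection(): the rdma_readwrite_threshold tunable, which decides when to switch from send/recv to MR-based RDMA read/write, is capped by the connection's negotiated max_fragmented_send_size. The ternary above is simply a min; a trivial equivalent for clarity:

/* Sketch: effective threshold is min(configured, max_fragmented_send_size). */
static unsigned int effective_rdma_threshold(unsigned int configured,
					     unsigned int max_frag_send)
{
	return configured > max_frag_send ? max_frag_send : configured;
}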
@@ -1790,6 +2081,7 @@ again:
if (sc->recv_io.reassembly.data_length >= size) {
int queue_length;
int queue_removed = 0;
+ unsigned long flags;
/*
* Need to make sure reassembly_data_length is read before
@@ -1804,7 +2096,7 @@ again:
to_read = size;
offset = sc->recv_io.reassembly.first_entry_offset;
while (data_read < size) {
- response = _get_first_reassembly(info);
+ response = _get_first_reassembly(sc);
data_transfer = smbdirect_recv_io_payload(response);
data_length = le32_to_cpu(data_transfer->data_length);
remaining_data_length =
@@ -1849,16 +2141,15 @@ again:
if (queue_length)
list_del(&response->list);
else {
- spin_lock_irq(
- &sc->recv_io.reassembly.lock);
+ spin_lock_irqsave(
+ &sc->recv_io.reassembly.lock, flags);
list_del(&response->list);
- spin_unlock_irq(
- &sc->recv_io.reassembly.lock);
+ spin_unlock_irqrestore(
+ &sc->recv_io.reassembly.lock, flags);
}
queue_removed++;
- info->count_reassembly_queue--;
- info->count_dequeue_reassembly_queue++;
- put_receive_buffer(info, response);
+ sc->statistics.dequeue_reassembly_queue++;
+ put_receive_buffer(sc, response);
offset = 0;
log_read(INFO, "put_receive_buffer offset=0\n");
} else
@@ -1872,10 +2163,10 @@ again:
to_read, data_read, offset);
}
- spin_lock_irq(&sc->recv_io.reassembly.lock);
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
sc->recv_io.reassembly.data_length -= data_read;
sc->recv_io.reassembly.queue_length -= queue_removed;
- spin_unlock_irq(&sc->recv_io.reassembly.lock);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
sc->recv_io.reassembly.first_entry_offset = offset;
log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
@@ -1960,13 +2251,13 @@ int smbd_send(struct TCP_Server_Info *server,
klen += rqst->rq_iov[i].iov_len;
iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
- rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length);
+ rc = smbd_post_send_full_iter(sc, &iter, &remaining_data_length);
if (rc < 0)
break;
if (iov_iter_count(&rqst->rq_iter) > 0) {
/* And then the data pages if there are any */
- rc = smbd_post_send_full_iter(info, &rqst->rq_iter,
+ rc = smbd_post_send_full_iter(sc, &rqst->rq_iter,
&remaining_data_length);
if (rc < 0)
break;
@@ -1981,8 +2272,8 @@ int smbd_send(struct TCP_Server_Info *server,
* that means all the I/Os have been out and we are good to return
*/
- wait_event(info->wait_send_pending,
- atomic_read(&info->send_pending) == 0 ||
+ wait_event(sc->send_io.pending.zero_wait_queue,
+ atomic_read(&sc->send_io.pending.count) == 0 ||
sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0)
@@ -1993,14 +2284,13 @@ int smbd_send(struct TCP_Server_Info *server,
static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smbd_mr *mr;
- struct ib_cqe *cqe;
+ struct smbdirect_mr_io *mr =
+ container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
+ struct smbdirect_socket *sc = mr->socket;
if (wc->status) {
log_rdma_mr(ERR, "status=%d\n", wc->status);
- cqe = wc->wr_cqe;
- mr = container_of(cqe, struct smbd_mr, cqe);
- smbd_disconnect_rdma_connection(mr->conn);
+ smbd_disconnect_rdma_connection(sc);
}
}
@@ -2015,14 +2305,14 @@ static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
*/
static void smbd_mr_recovery_work(struct work_struct *work)
{
- struct smbd_connection *info =
- container_of(work, struct smbd_connection, mr_recovery_work);
- struct smbdirect_socket *sc = &info->socket;
- struct smbd_mr *smbdirect_mr;
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, mr_io.recovery_work);
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_mr_io *smbdirect_mr;
int rc;
- list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
- if (smbdirect_mr->state == MR_ERROR) {
+ list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) {
+ if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) {
/* recover this MR entry */
rc = ib_dereg_mr(smbdirect_mr->mr);
@@ -2030,25 +2320,25 @@ static void smbd_mr_recovery_work(struct work_struct *work)
log_rdma_mr(ERR,
"ib_dereg_mr failed rc=%x\n",
rc);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
continue;
}
smbdirect_mr->mr = ib_alloc_mr(
- sc->ib.pd, info->mr_type,
- info->max_frmr_depth);
+ sc->ib.pd, sc->mr_io.type,
+ sp->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
- info->mr_type,
- info->max_frmr_depth);
- smbd_disconnect_rdma_connection(info);
+ sc->mr_io.type,
+ sp->max_frmr_depth);
+ smbd_disconnect_rdma_connection(sc);
continue;
}
} else
/* This MR is being used, don't recover it */
continue;
- smbdirect_mr->state = MR_READY;
+ smbdirect_mr->state = SMBDIRECT_MR_READY;
/* smbdirect_mr->state is updated by this function
* and is read and updated by I/O issuing CPUs trying
@@ -2057,19 +2347,18 @@ static void smbd_mr_recovery_work(struct work_struct *work)
* value is updated before waking up any calls to
* get_mr() from the I/O issuing CPUs
*/
- if (atomic_inc_return(&info->mr_ready_count) == 1)
- wake_up_interruptible(&info->wait_mr);
+ if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
+ wake_up(&sc->mr_io.ready.wait_queue);
}
}
-static void destroy_mr_list(struct smbd_connection *info)
+static void destroy_mr_list(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
- struct smbd_mr *mr, *tmp;
+ struct smbdirect_mr_io *mr, *tmp;
- cancel_work_sync(&info->mr_recovery_work);
- list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
- if (mr->state == MR_INVALIDATED)
+ disable_work_sync(&sc->mr_io.recovery_work);
+ list_for_each_entry_safe(mr, tmp, &sc->mr_io.all.list, list) {
+ if (mr->state == SMBDIRECT_MR_INVALIDATED)
ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl,
mr->sgt.nents, mr->dir);
ib_dereg_mr(mr->mr);
@@ -2085,32 +2374,32 @@ static void destroy_mr_list(struct smbd_connection *info)
* Recovery is done in smbd_mr_recovery_work. The content of list entry changes
* as MRs are used and recovered for I/O, but the list links will not change
*/
-static int allocate_mr_list(struct smbd_connection *info)
+static int allocate_mr_list(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
int i;
- struct smbd_mr *smbdirect_mr, *tmp;
-
- INIT_LIST_HEAD(&info->mr_list);
- init_waitqueue_head(&info->wait_mr);
- spin_lock_init(&info->mr_list_lock);
- atomic_set(&info->mr_ready_count, 0);
- atomic_set(&info->mr_used_count, 0);
- init_waitqueue_head(&info->wait_for_mr_cleanup);
- INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
+ struct smbdirect_mr_io *smbdirect_mr, *tmp;
+
+ INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work);
+
+ if (sp->responder_resources == 0) {
+ log_rdma_mr(ERR, "responder_resources negotiated as 0\n");
+ return -EINVAL;
+ }
+
/* Allocate more MRs (2x) than hardware responder_resources */
- for (i = 0; i < info->responder_resources * 2; i++) {
+ for (i = 0; i < sp->responder_resources * 2; i++) {
smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
if (!smbdirect_mr)
goto cleanup_entries;
- smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, info->mr_type,
- info->max_frmr_depth);
+ smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, sc->mr_io.type,
+ sp->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
- info->mr_type, info->max_frmr_depth);
+ sc->mr_io.type, sp->max_frmr_depth);
goto out;
}
- smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth,
+ smbdirect_mr->sgt.sgl = kcalloc(sp->max_frmr_depth,
sizeof(struct scatterlist),
GFP_KERNEL);
if (!smbdirect_mr->sgt.sgl) {
@@ -2118,18 +2407,18 @@ static int allocate_mr_list(struct smbd_connection *info)
ib_dereg_mr(smbdirect_mr->mr);
goto out;
}
- smbdirect_mr->state = MR_READY;
- smbdirect_mr->conn = info;
+ smbdirect_mr->state = SMBDIRECT_MR_READY;
+ smbdirect_mr->socket = sc;
- list_add_tail(&smbdirect_mr->list, &info->mr_list);
- atomic_inc(&info->mr_ready_count);
+ list_add_tail(&smbdirect_mr->list, &sc->mr_io.all.list);
+ atomic_inc(&sc->mr_io.ready.count);
}
return 0;
out:
kfree(smbdirect_mr);
cleanup_entries:
- list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
+ list_for_each_entry_safe(smbdirect_mr, tmp, &sc->mr_io.all.list, list) {
list_del(&smbdirect_mr->list);
ib_dereg_mr(smbdirect_mr->mr);
kfree(smbdirect_mr->sgt.sgl);
@@ -2146,14 +2435,14 @@ cleanup_entries:
* issuing I/O trying to get MR at the same time, mr_list_lock is used to
* protect this situation.
*/
-static struct smbd_mr *get_mr(struct smbd_connection *info)
+static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc)
{
- struct smbdirect_socket *sc = &info->socket;
- struct smbd_mr *ret;
+ struct smbdirect_mr_io *ret;
+ unsigned long flags;
int rc;
again:
- rc = wait_event_interruptible(info->wait_mr,
- atomic_read(&info->mr_ready_count) ||
+ rc = wait_event_interruptible(sc->mr_io.ready.wait_queue,
+ atomic_read(&sc->mr_io.ready.count) ||
sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (rc) {
log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
@@ -2165,18 +2454,18 @@ again:
return NULL;
}
- spin_lock(&info->mr_list_lock);
- list_for_each_entry(ret, &info->mr_list, list) {
- if (ret->state == MR_READY) {
- ret->state = MR_REGISTERED;
- spin_unlock(&info->mr_list_lock);
- atomic_dec(&info->mr_ready_count);
- atomic_inc(&info->mr_used_count);
+ spin_lock_irqsave(&sc->mr_io.all.lock, flags);
+ list_for_each_entry(ret, &sc->mr_io.all.list, list) {
+ if (ret->state == SMBDIRECT_MR_READY) {
+ ret->state = SMBDIRECT_MR_REGISTERED;
+ spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
+ atomic_dec(&sc->mr_io.ready.count);
+ atomic_inc(&sc->mr_io.used.count);
return ret;
}
}
- spin_unlock(&info->mr_list_lock);
+ spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
/*
 * It is possible that we could fail to get an MR because other processes may
 * try to acquire an MR at the same time. If this is the case, retry it.
@@ -2187,8 +2476,7 @@ again:
/*
* Transcribe the pages from an iterator into an MR scatterlist.
*/
-static int smbd_iter_to_mr(struct smbd_connection *info,
- struct iov_iter *iter,
+static int smbd_iter_to_mr(struct iov_iter *iter,
struct sg_table *sgt,
unsigned int max_sg)
{
@@ -2210,25 +2498,26 @@ static int smbd_iter_to_mr(struct smbd_connection *info,
* need_invalidate: true if this MR needs to be locally invalidated after I/O
* return value: the MR registered, NULL if failed.
*/
-struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
+struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info,
struct iov_iter *iter,
bool writing, bool need_invalidate)
{
struct smbdirect_socket *sc = &info->socket;
- struct smbd_mr *smbdirect_mr;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_mr_io *smbdirect_mr;
int rc, num_pages;
enum dma_data_direction dir;
struct ib_reg_wr *reg_wr;
- num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1);
- if (num_pages > info->max_frmr_depth) {
+ num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
+ if (num_pages > sp->max_frmr_depth) {
log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
- num_pages, info->max_frmr_depth);
+ num_pages, sp->max_frmr_depth);
WARN_ON_ONCE(1);
return NULL;
}
- smbdirect_mr = get_mr(info);
+ smbdirect_mr = get_mr(sc);
if (!smbdirect_mr) {
log_rdma_mr(ERR, "get_mr returning NULL\n");
return NULL;
@@ -2241,8 +2530,8 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
smbdirect_mr->sgt.orig_nents = 0;
log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
- num_pages, iov_iter_count(iter), info->max_frmr_depth);
- smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth);
+ num_pages, iov_iter_count(iter), sp->max_frmr_depth);
+ smbd_iter_to_mr(iter, &smbdirect_mr->sgt, sp->max_frmr_depth);
rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents, dir);
@@ -2287,32 +2576,32 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
rc, reg_wr->key);
- /* If all failed, attempt to recover this MR by setting it MR_ERROR*/
+	/* If all failed, attempt to recover this MR by setting it to SMBDIRECT_MR_ERROR */
map_mr_error:
ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents, smbdirect_mr->dir);
dma_map_error:
- smbdirect_mr->state = MR_ERROR;
- if (atomic_dec_and_test(&info->mr_used_count))
- wake_up(&info->wait_for_mr_cleanup);
+ smbdirect_mr->state = SMBDIRECT_MR_ERROR;
+ if (atomic_dec_and_test(&sc->mr_io.used.count))
+ wake_up(&sc->mr_io.cleanup.wait_queue);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
return NULL;
}
static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smbd_mr *smbdirect_mr;
+ struct smbdirect_mr_io *smbdirect_mr;
struct ib_cqe *cqe;
cqe = wc->wr_cqe;
- smbdirect_mr = container_of(cqe, struct smbd_mr, cqe);
- smbdirect_mr->state = MR_INVALIDATED;
+ smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe);
+ smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED;
if (wc->status != IB_WC_SUCCESS) {
log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
- smbdirect_mr->state = MR_ERROR;
+ smbdirect_mr->state = SMBDIRECT_MR_ERROR;
}
complete(&smbdirect_mr->invalidate_done);
}
@@ -2323,11 +2612,10 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
 * and we have to locally invalidate the buffer to prevent the data from being
 * modified by the remote peer after the upper layer consumes it
*/
-int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
+int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr)
{
struct ib_send_wr *wr;
- struct smbd_connection *info = smbdirect_mr->conn;
- struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket *sc = smbdirect_mr->socket;
int rc = 0;
if (smbdirect_mr->need_invalidate) {
@@ -2344,36 +2632,36 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
rc = ib_post_send(sc->ib.qp, wr, NULL);
if (rc) {
log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
- smbd_disconnect_rdma_connection(info);
+ smbd_disconnect_rdma_connection(sc);
goto done;
}
wait_for_completion(&smbdirect_mr->invalidate_done);
smbdirect_mr->need_invalidate = false;
} else
/*
- * For remote invalidation, just set it to MR_INVALIDATED
+ * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED
* and defer to mr_recovery_work to recover the MR for next use
*/
- smbdirect_mr->state = MR_INVALIDATED;
+ smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED;
- if (smbdirect_mr->state == MR_INVALIDATED) {
+ if (smbdirect_mr->state == SMBDIRECT_MR_INVALIDATED) {
ib_dma_unmap_sg(
sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents,
smbdirect_mr->dir);
- smbdirect_mr->state = MR_READY;
- if (atomic_inc_return(&info->mr_ready_count) == 1)
- wake_up_interruptible(&info->wait_mr);
+ smbdirect_mr->state = SMBDIRECT_MR_READY;
+ if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
+ wake_up(&sc->mr_io.ready.wait_queue);
} else
/*
		 * Schedule the work to do MR recovery for future I/Os. MR
		 * recovery is slow and we don't want it to block the current I/O
*/
- queue_work(info->workqueue, &info->mr_recovery_work);
+ queue_work(sc->workqueue, &sc->mr_io.recovery_work);
done:
- if (atomic_dec_and_test(&info->mr_used_count))
- wake_up(&info->wait_for_mr_cleanup);
+ if (atomic_dec_and_test(&sc->mr_io.used.count))
+ wake_up(&sc->mr_io.cleanup.wait_queue);
return rc;
}
diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h
index e45aa9ddd71d..d67ac5ddaff4 100644
--- a/fs/smb/client/smbdirect.h
+++ b/fs/smb/client/smbdirect.h
@@ -27,12 +27,6 @@ extern int smbd_max_send_size;
extern int smbd_send_credit_target;
extern int smbd_receive_credit_max;
-enum keep_alive_status {
- KEEP_ALIVE_NONE,
- KEEP_ALIVE_PENDING,
- KEEP_ALIVE_SENT,
-};
-
/*
* The context for the SMBDirect transport
* Everything related to the transport is here. It has several logical parts
@@ -44,79 +38,14 @@ enum keep_alive_status {
*/
struct smbd_connection {
struct smbdirect_socket socket;
-
- int ri_rc;
- struct completion ri_done;
- wait_queue_head_t status_wait;
-
- struct completion negotiate_completion;
- bool negotiate_done;
-
- struct work_struct disconnect_work;
- struct work_struct post_send_credits_work;
-
- spinlock_t lock_new_credits_offered;
- int new_credits_offered;
-
- /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */
- enum keep_alive_status keep_alive_requested;
- int protocol;
- atomic_t send_credits;
- atomic_t receive_credits;
- int receive_credit_target;
-
- /* Memory registrations */
- /* Maximum number of RDMA read/write outstanding on this connection */
- int responder_resources;
- /* Maximum number of pages in a single RDMA write/read on this connection */
- int max_frmr_depth;
- /*
- * If payload is less than or equal to the threshold,
- * use RDMA send/recv to send upper layer I/O.
- * If payload is more than the threshold,
- * use RDMA read/write through memory registration for I/O.
- */
- int rdma_readwrite_threshold;
- enum ib_mr_type mr_type;
- struct list_head mr_list;
- spinlock_t mr_list_lock;
- /* The number of available MRs ready for memory registration */
- atomic_t mr_ready_count;
- atomic_t mr_used_count;
- wait_queue_head_t wait_mr;
- struct work_struct mr_recovery_work;
- /* Used by transport to wait until all MRs are returned */
- wait_queue_head_t wait_for_mr_cleanup;
-
- /* Activity accounting */
- atomic_t send_pending;
- wait_queue_head_t wait_send_pending;
- wait_queue_head_t wait_post_send;
-
- /* Receive queue */
- int count_receive_queue;
- wait_queue_head_t wait_receive_queues;
-
- bool send_immediate;
-
- wait_queue_head_t wait_send_queue;
-
- struct workqueue_struct *workqueue;
- struct delayed_work idle_timer_work;
-
- /* for debug purposes */
- unsigned int count_get_receive_buffer;
- unsigned int count_put_receive_buffer;
- unsigned int count_reassembly_queue;
- unsigned int count_enqueue_reassembly_queue;
- unsigned int count_dequeue_reassembly_queue;
- unsigned int count_send_empty;
};
/* Create a SMBDirect session */
struct smbd_connection *smbd_get_connection(
struct TCP_Server_Info *server, struct sockaddr *dstaddr);
+const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn);
+
/* Reconnect SMBDirect session */
int smbd_reconnect(struct TCP_Server_Info *server);
/* Destroy SMBDirect session */
@@ -127,34 +56,11 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg);
int smbd_send(struct TCP_Server_Info *server,
int num_rqst, struct smb_rqst *rqst);
-enum mr_state {
- MR_READY,
- MR_REGISTERED,
- MR_INVALIDATED,
- MR_ERROR
-};
-
-struct smbd_mr {
- struct smbd_connection *conn;
- struct list_head list;
- enum mr_state state;
- struct ib_mr *mr;
- struct sg_table sgt;
- enum dma_data_direction dir;
- union {
- struct ib_reg_wr wr;
- struct ib_send_wr inv_wr;
- };
- struct ib_cqe cqe;
- bool need_invalidate;
- struct completion invalidate_done;
-};
-
/* Interfaces to register and deregister MR for RDMA read/write */
-struct smbd_mr *smbd_register_mr(
+struct smbdirect_mr_io *smbd_register_mr(
struct smbd_connection *info, struct iov_iter *iter,
bool writing, bool need_invalidate);
-int smbd_deregister_mr(struct smbd_mr *mr);
+int smbd_deregister_mr(struct smbdirect_mr_io *mr);
#else
#define cifs_rdma_enabled(server) 0
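The per-connection parameters that used to live as loose fields in struct smbd_connection are now reached through the new smbd_get_parameters() accessor. A minimal sketch of a caller, assuming only the declarations above; the helper name rdma_readwrite_limit() is hypothetical and not part of this patch:

	/* sketch: read the negotiated I/O limit through the accessor */
	static unsigned int rdma_readwrite_limit(struct smbd_connection *conn)
	{
		const struct smbdirect_socket_parameters *sp =
			smbd_get_parameters(conn);

		return sp->max_read_write_size;
	}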
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 93e5b2bb9f28..fd650e2afc76 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -669,13 +669,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
-DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(unlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter);
-DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter);
@@ -710,13 +709,12 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
-DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(unlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done);
-DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done);
@@ -756,14 +754,13 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
-DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(unlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
-DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err);
@@ -1171,8 +1168,54 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \
__u64 lease_key_high), \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high))
-DEFINE_SMB3_LEASE_DONE_EVENT(lease_done);
-DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found);
+DEFINE_SMB3_LEASE_DONE_EVENT(lease_ack_done);
+/* Tracepoint when a lease break request is received/entered (includes epoch and flags) */
+DECLARE_EVENT_CLASS(smb3_lease_enter_class,
+ TP_PROTO(__u32 lease_state,
+ __u32 flags,
+ __u16 epoch,
+ __u32 tid,
+ __u64 sesid,
+ __u64 lease_key_low,
+ __u64 lease_key_high),
+ TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high),
+ TP_STRUCT__entry(
+ __field(__u32, lease_state)
+ __field(__u32, flags)
+ __field(__u16, epoch)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, lease_key_low)
+ __field(__u64, lease_key_high)
+ ),
+ TP_fast_assign(
+ __entry->lease_state = lease_state;
+ __entry->flags = flags;
+ __entry->epoch = epoch;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->lease_key_low = lease_key_low;
+ __entry->lease_key_high = lease_key_high;
+ ),
+ TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x flags=0x%x epoch=%u",
+ __entry->sesid, __entry->tid, __entry->lease_key_high,
+ __entry->lease_key_low, __entry->lease_state, __entry->flags, __entry->epoch)
+)
+
+#define DEFINE_SMB3_LEASE_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_lease_enter_class, smb3_##name, \
+ TP_PROTO(__u32 lease_state, \
+ __u32 flags, \
+ __u16 epoch, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 lease_key_low, \
+ __u64 lease_key_high), \
+ TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high))
+
+DEFINE_SMB3_LEASE_ENTER_EVENT(lease_break_enter);
+/* Lease not found: reuse lease_enter payload (includes epoch and flags) */
+DEFINE_SMB3_LEASE_ENTER_EVENT(lease_not_found);
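With this class the lease-break receive path can record epoch and flags on entry, not only on completion. A minimal sketch of a call site, assuming the values have already been pulled out of the lease break PDU (the local variable names are assumptions):

	/* sketch: emit the enter tracepoint from the lease break handler */
	trace_smb3_lease_break_enter(lease_state, flags, epoch,
				     tid, ses_id,
				     lease_key_low, lease_key_high);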
DECLARE_EVENT_CLASS(smb3_lease_err_class,
TP_PROTO(__u32 lease_state,
@@ -1213,7 +1256,7 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \
int rc), \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc))
-DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+DEFINE_SMB3_LEASE_ERR_EVENT(lease_ack_err);
DECLARE_EVENT_CLASS(smb3_connect_class,
TP_PROTO(char *hostname,
diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h
index b9a385344ff3..05cc6a9d0ccd 100644
--- a/fs/smb/common/smbdirect/smbdirect.h
+++ b/fs/smb/common/smbdirect/smbdirect.h
@@ -23,6 +23,12 @@ struct smbdirect_buffer_descriptor_v1 {
* Some values are important for the upper layer.
*/
struct smbdirect_socket_parameters {
+ __u32 resolve_addr_timeout_msec;
+ __u32 resolve_route_timeout_msec;
+ __u32 rdma_connect_timeout_msec;
+ __u32 negotiate_timeout_msec;
+ __u8 initiator_depth;
+ __u8 responder_resources;
__u16 recv_credit_max;
__u16 send_credit_target;
__u32 max_send_size;
@@ -30,6 +36,7 @@ struct smbdirect_socket_parameters {
__u32 max_recv_size;
__u32 max_fragmented_recv_size;
__u32 max_read_write_size;
+ __u32 max_frmr_depth;
__u32 keepalive_interval_msec;
__u32 keepalive_timeout_msec;
} __packed;
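All of the new fields are owned by the transport that creates the socket, and the *_msec members are plain milliseconds. A minimal sketch of a server-side initializer using the defaults that appear later in this series; the helper name is hypothetical:

	/* sketch: fill the new negotiation/keepalive knobs */
	static void smbd_init_default_params(struct smbdirect_socket_parameters *sp)
	{
		sp->negotiate_timeout_msec  =   5 * 1000; /* server default in this series */
		sp->keepalive_interval_msec = 120 * 1000;
		sp->keepalive_timeout_msec  =   5 * 1000;
		sp->responder_resources     = 1;
	}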
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
index 3c4a8d627aa3..db22a1d0546b 100644
--- a/fs/smb/common/smbdirect/smbdirect_socket.h
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -6,22 +6,102 @@
#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
+#include <rdma/rw.h>
+
enum smbdirect_socket_status {
SMBDIRECT_SOCKET_CREATED,
- SMBDIRECT_SOCKET_CONNECTING,
- SMBDIRECT_SOCKET_CONNECTED,
+ SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED,
+ SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
+ SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED,
+ SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED,
+ SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING,
+ SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED,
+ SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED,
+ SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING,
+ SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED,
+ SMBDIRECT_SOCKET_NEGOTIATE_NEEDED,
+ SMBDIRECT_SOCKET_NEGOTIATE_RUNNING,
SMBDIRECT_SOCKET_NEGOTIATE_FAILED,
+ SMBDIRECT_SOCKET_CONNECTED,
+ SMBDIRECT_SOCKET_ERROR,
SMBDIRECT_SOCKET_DISCONNECTING,
SMBDIRECT_SOCKET_DISCONNECTED,
SMBDIRECT_SOCKET_DESTROYED
};
+static __always_inline
+const char *smbdirect_socket_status_string(enum smbdirect_socket_status status)
+{
+ switch (status) {
+ case SMBDIRECT_SOCKET_CREATED:
+ return "CREATED";
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ return "RESOLVE_ADDR_NEEDED";
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ return "RESOLVE_ADDR_RUNNING";
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ return "RESOLVE_ADDR_FAILED";
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ return "RESOLVE_ROUTE_NEEDED";
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ return "RESOLVE_ROUTE_RUNNING";
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ return "RESOLVE_ROUTE_FAILED";
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ return "RDMA_CONNECT_NEEDED";
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ return "RDMA_CONNECT_RUNNING";
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ return "RDMA_CONNECT_FAILED";
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ return "NEGOTIATE_NEEDED";
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ return "NEGOTIATE_RUNNING";
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ return "NEGOTIATE_FAILED";
+ case SMBDIRECT_SOCKET_CONNECTED:
+ return "CONNECTED";
+ case SMBDIRECT_SOCKET_ERROR:
+ return "ERROR";
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ return "DISCONNECTING";
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ return "DISCONNECTED";
+ case SMBDIRECT_SOCKET_DESTROYED:
+ return "DESTROYED";
+ }
+
+ return "<unknown>";
+}
+
+enum smbdirect_keepalive_status {
+ SMBDIRECT_KEEPALIVE_NONE,
+ SMBDIRECT_KEEPALIVE_PENDING,
+ SMBDIRECT_KEEPALIVE_SENT
+};
+
struct smbdirect_socket {
enum smbdirect_socket_status status;
+ wait_queue_head_t status_wait;
+ int first_error;
+
+ /*
+ * This points to the workqueue to
+ * be used for this socket.
+	 * It can be a per-socket workqueue (on the client)
+	 * or a global workqueue (on the server).
+ */
+ struct workqueue_struct *workqueue;
+
+ struct work_struct disconnect_work;
/* RDMA related */
struct {
struct rdma_cm_id *cm_id;
+ /*
+	 * This is for iWARP MPA v1
+ */
+ bool legacy_iwarp;
} rdma;
/* IB verbs related */
@@ -40,6 +120,15 @@ struct smbdirect_socket {
struct smbdirect_socket_parameters parameters;
/*
+ * The state for keepalive and timeout handling
+ */
+ struct {
+ enum smbdirect_keepalive_status keepalive;
+ struct work_struct immediate_work;
+ struct delayed_work timer_work;
+ } idle;
+
+ /*
* The state for posted send buffers
*/
struct {
@@ -51,6 +140,29 @@ struct smbdirect_socket {
struct kmem_cache *cache;
mempool_t *pool;
} mem;
+
+ /*
+ * The credit state for the send side
+ */
+ struct {
+ atomic_t count;
+ wait_queue_head_t wait_queue;
+ } credits;
+
+ /*
+ * The state about posted/pending sends
+ */
+ struct {
+ atomic_t count;
+ /*
+ * woken when count is decremented
+ */
+ wait_queue_head_t dec_wait_queue;
+ /*
+		 * woken when count reaches zero
+ */
+ wait_queue_head_t zero_wait_queue;
+ } pending;
} send_io;
/*
@@ -85,6 +197,23 @@ struct smbdirect_socket {
} free;
/*
+ * The state for posted recv_io messages
+ * and the refill work struct.
+ */
+ struct {
+ atomic_t count;
+ struct work_struct refill_work;
+ } posted;
+
+ /*
+ * The credit state for the recv side
+ */
+ struct {
+ u16 target;
+ atomic_t count;
+ } credits;
+
+ /*
* The list of arrived non-empty smbdirect_recv_io
* structures
*
@@ -110,8 +239,137 @@ struct smbdirect_socket {
bool full_packet_received;
} reassembly;
} recv_io;
+
+ /*
+ * The state for Memory registrations on the client
+ */
+ struct {
+ enum ib_mr_type type;
+
+ /*
+		 * The list of all smbdirect_mr_io
+ * structures
+ */
+ struct {
+ struct list_head list;
+ spinlock_t lock;
+ } all;
+
+ /*
+ * The number of available MRs ready for memory registration
+ */
+ struct {
+ atomic_t count;
+ wait_queue_head_t wait_queue;
+ } ready;
+
+ /*
+ * The number of used MRs
+ */
+ struct {
+ atomic_t count;
+ } used;
+
+ struct work_struct recovery_work;
+
+ /* Used by transport to wait until all MRs are returned */
+ struct {
+ wait_queue_head_t wait_queue;
+ } cleanup;
+ } mr_io;
+
+ /*
+ * The state for RDMA read/write requests on the server
+ */
+ struct {
+ /*
+ * The credit state for the send side
+ */
+ struct {
+ /*
+ * The maximum number of rw credits
+ */
+ size_t max;
+ /*
+ * The number of pages per credit
+ */
+ size_t num_pages;
+ atomic_t count;
+ wait_queue_head_t wait_queue;
+ } credits;
+ } rw_io;
+
+ /*
+ * For debug purposes
+ */
+ struct {
+ u64 get_receive_buffer;
+ u64 put_receive_buffer;
+ u64 enqueue_reassembly_queue;
+ u64 dequeue_reassembly_queue;
+ u64 send_empty;
+ } statistics;
};
+static void __smbdirect_socket_disabled_work(struct work_struct *work)
+{
+ /*
+ * Should never be called as disable_[delayed_]work_sync() was used.
+ */
+ WARN_ON_ONCE(1);
+}
+
+static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
+{
+ /*
+ * This also sets status = SMBDIRECT_SOCKET_CREATED
+ */
+ BUILD_BUG_ON(SMBDIRECT_SOCKET_CREATED != 0);
+ memset(sc, 0, sizeof(*sc));
+
+ init_waitqueue_head(&sc->status_wait);
+
+ INIT_WORK(&sc->disconnect_work, __smbdirect_socket_disabled_work);
+ disable_work_sync(&sc->disconnect_work);
+
+ INIT_WORK(&sc->idle.immediate_work, __smbdirect_socket_disabled_work);
+ disable_work_sync(&sc->idle.immediate_work);
+ INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work);
+ disable_delayed_work_sync(&sc->idle.timer_work);
+
+ atomic_set(&sc->send_io.credits.count, 0);
+ init_waitqueue_head(&sc->send_io.credits.wait_queue);
+
+ atomic_set(&sc->send_io.pending.count, 0);
+ init_waitqueue_head(&sc->send_io.pending.dec_wait_queue);
+ init_waitqueue_head(&sc->send_io.pending.zero_wait_queue);
+
+ INIT_LIST_HEAD(&sc->recv_io.free.list);
+ spin_lock_init(&sc->recv_io.free.lock);
+
+ atomic_set(&sc->recv_io.posted.count, 0);
+ INIT_WORK(&sc->recv_io.posted.refill_work, __smbdirect_socket_disabled_work);
+ disable_work_sync(&sc->recv_io.posted.refill_work);
+
+ atomic_set(&sc->recv_io.credits.count, 0);
+
+ INIT_LIST_HEAD(&sc->recv_io.reassembly.list);
+ spin_lock_init(&sc->recv_io.reassembly.lock);
+ init_waitqueue_head(&sc->recv_io.reassembly.wait_queue);
+
+ atomic_set(&sc->rw_io.credits.count, 0);
+ init_waitqueue_head(&sc->rw_io.credits.wait_queue);
+
+ spin_lock_init(&sc->mr_io.all.lock);
+ INIT_LIST_HEAD(&sc->mr_io.all.list);
+ atomic_set(&sc->mr_io.ready.count, 0);
+ init_waitqueue_head(&sc->mr_io.ready.wait_queue);
+ atomic_set(&sc->mr_io.used.count, 0);
+ INIT_WORK(&sc->mr_io.recovery_work, __smbdirect_socket_disabled_work);
+ disable_work_sync(&sc->mr_io.recovery_work);
+ init_waitqueue_head(&sc->mr_io.cleanup.wait_queue);
+}
+
struct smbdirect_send_io {
struct smbdirect_socket *socket;
struct ib_cqe cqe;
@@ -136,6 +394,23 @@ struct smbdirect_send_io {
u8 packet[];
};
+struct smbdirect_send_batch {
+ /*
+ * List of smbdirect_send_io messages
+ */
+ struct list_head msg_list;
+ /*
+ * Number of list entries
+ */
+ size_t wr_cnt;
+
+ /*
+ * Possible remote key invalidation state
+ */
+ bool need_invalidate_rkey;
+ u32 remote_key;
+};
+
struct smbdirect_recv_io {
struct smbdirect_socket *socket;
struct ib_cqe cqe;
@@ -158,4 +433,44 @@ struct smbdirect_recv_io {
u8 packet[];
};
+enum smbdirect_mr_state {
+ SMBDIRECT_MR_READY,
+ SMBDIRECT_MR_REGISTERED,
+ SMBDIRECT_MR_INVALIDATED,
+ SMBDIRECT_MR_ERROR
+};
+
+struct smbdirect_mr_io {
+ struct smbdirect_socket *socket;
+ struct ib_cqe cqe;
+
+ struct list_head list;
+
+ enum smbdirect_mr_state state;
+ struct ib_mr *mr;
+ struct sg_table sgt;
+ enum dma_data_direction dir;
+ union {
+ struct ib_reg_wr wr;
+ struct ib_send_wr inv_wr;
+ };
+
+ bool need_invalidate;
+ struct completion invalidate_done;
+};
+
+struct smbdirect_rw_io {
+ struct smbdirect_socket *socket;
+ struct ib_cqe cqe;
+
+ struct list_head list;
+
+ int error;
+ struct completion *completion;
+
+ struct rdma_rw_ctx rdma_ctx;
+ struct sg_table sgt;
+ struct scatterlist sg_list[];
+};
+
#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */
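smbdirect_socket_init() zeroes the whole socket and deliberately wires every work item to a disabled placeholder, so teardown paths can call disable_*_work_sync() even on items the transport never armed, without a real handler ever running. The creating transport then installs its workqueue and handlers. A minimal sketch, mirroring what the server side does later in this series; the handler and function names are hypothetical:

	static void my_disconnect_work(struct work_struct *work);

	/* sketch: bring a freshly allocated socket up to CREATED */
	static void my_socket_setup(struct smbdirect_socket *sc,
				    struct workqueue_struct *wq)
	{
		smbdirect_socket_init(sc);	/* status = SMBDIRECT_SOCKET_CREATED */
		sc->workqueue = wq;

		/* re-initializing installs the real handler in place of the disabled stub */
		INIT_WORK(&sc->disconnect_work, my_disconnect_work);
	}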
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 67c4f73398df..91a934411134 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -243,7 +243,7 @@ int ksmbd_conn_write(struct ksmbd_work *work)
int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len)
{
int ret = -EINVAL;
@@ -257,7 +257,7 @@ int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len)
{
int ret = -EINVAL;
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 2aa8084bb593..07b43634262a 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -19,6 +19,8 @@
#include "smb_common.h"
#include "ksmbd_work.h"
+struct smbdirect_buffer_descriptor_v1;
+
#define KSMBD_SOCKET_BACKLOG 16
enum {
@@ -133,11 +135,11 @@ struct ksmbd_transport_ops {
unsigned int remote_key);
int (*rdma_read)(struct ksmbd_transport *t,
void *buf, unsigned int len,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len);
int (*rdma_write)(struct ksmbd_transport *t,
void *buf, unsigned int len,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len);
void (*free_transport)(struct ksmbd_transport *kt);
};
@@ -163,11 +165,11 @@ bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c);
int ksmbd_conn_write(struct ksmbd_work *work);
int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len);
int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len);
void ksmbd_conn_enqueue_request(struct ksmbd_work *work);
void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work);
diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c
index 72b00ca6e455..4a71f46d7020 100644
--- a/fs/smb/server/ksmbd_work.c
+++ b/fs/smb/server/ksmbd_work.c
@@ -78,7 +78,7 @@ int ksmbd_work_pool_init(void)
int ksmbd_workqueue_init(void)
{
- ksmbd_wq = alloc_workqueue("ksmbd-io", 0, 0);
+ ksmbd_wq = alloc_workqueue("ksmbd-io", WQ_PERCPU, 0);
if (!ksmbd_wq)
return -ENOMEM;
return 0;
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 8c9c49c3a0a4..40420544cc25 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -365,6 +365,7 @@ static void server_ctrl_handle_init(struct server_ctrl_struct *ctrl)
return;
}
+ pr_info("running\n");
WRITE_ONCE(server_conf.state, SERVER_STATE_RUNNING);
}
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 0d92ce49aed7..0c069eff80b7 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -23,6 +23,7 @@
#include "asn1.h"
#include "connection.h"
#include "transport_ipc.h"
+#include "../common/smbdirect/smbdirect.h"
#include "transport_rdma.h"
#include "vfs.h"
#include "vfs_cache.h"
@@ -2951,18 +2952,19 @@ int smb2_open(struct ksmbd_work *work)
}
ksmbd_debug(SMB, "converted name = %s\n", name);
- if (strchr(name, ':')) {
- if (!test_share_config_flag(work->tcon->share_conf,
- KSMBD_SHARE_FLAG_STREAMS)) {
- rc = -EBADF;
- goto err_out2;
- }
- rc = parse_stream_name(name, &stream_name, &s_type);
- if (rc < 0)
- goto err_out2;
- }
if (posix_ctxt == false) {
+ if (strchr(name, ':')) {
+ if (!test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_STREAMS)) {
+ rc = -EBADF;
+ goto err_out2;
+ }
+ rc = parse_stream_name(name, &stream_name, &s_type);
+ if (rc < 0)
+ goto err_out2;
+ }
+
rc = ksmbd_validate_filename(name);
if (rc < 0)
goto err_out2;
@@ -3443,6 +3445,8 @@ int smb2_open(struct ksmbd_work *work)
fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE |
FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE));
+ fp->is_posix_ctxt = posix_ctxt;
+
/* fp should be searchable through ksmbd_inode.m_fp_list
* after daccess, saccess, attrib_only, and stream are
* initialized.
@@ -5988,7 +5992,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (IS_ERR(new_name))
return PTR_ERR(new_name);
- if (strchr(new_name, ':')) {
+ if (fp->is_posix_ctxt == false && strchr(new_name, ':')) {
int s_type;
char *xattr_stream_name, *stream_name = NULL;
size_t xattr_stream_size;
@@ -6662,7 +6666,7 @@ out:
}
static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
__le32 Channel,
__le16 ChannelInfoLength)
{
@@ -6698,7 +6702,7 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work,
int err;
err = ksmbd_conn_rdma_write(work->conn, data_buf, length,
- (struct smb2_buffer_desc_v1 *)
+ (struct smbdirect_buffer_descriptor_v1 *)
((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)),
le16_to_cpu(req->ReadChannelInfoLength));
if (err)
@@ -6758,7 +6762,11 @@ int smb2_read(struct ksmbd_work *work)
if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
req->Channel == SMB2_CHANNEL_RDMA_V1) {
is_rdma_channel = true;
- max_read_size = get_smbd_max_read_write_size();
+ max_read_size = get_smbd_max_read_write_size(work->conn->transport);
+ if (max_read_size == 0) {
+ err = -EINVAL;
+ goto out;
+ }
}
if (is_rdma_channel == true) {
@@ -6769,7 +6777,7 @@ int smb2_read(struct ksmbd_work *work)
goto out;
}
err = smb2_set_remote_key_for_rdma(work,
- (struct smb2_buffer_desc_v1 *)
+ (struct smbdirect_buffer_descriptor_v1 *)
((char *)req + ch_offset),
req->Channel,
req->ReadChannelInfoLength);
@@ -6964,7 +6972,7 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work,
return -ENOMEM;
ret = ksmbd_conn_rdma_read(work->conn, data_buf, length,
- (struct smb2_buffer_desc_v1 *)
+ (struct smbdirect_buffer_descriptor_v1 *)
((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)),
le16_to_cpu(req->WriteChannelInfoLength));
if (ret < 0) {
@@ -7016,7 +7024,11 @@ int smb2_write(struct ksmbd_work *work)
if (req->Channel == SMB2_CHANNEL_RDMA_V1 ||
req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
is_rdma_channel = true;
- max_write_size = get_smbd_max_read_write_size();
+ max_write_size = get_smbd_max_read_write_size(work->conn->transport);
+ if (max_write_size == 0) {
+ err = -EINVAL;
+ goto out;
+ }
length = le32_to_cpu(req->RemainingBytes);
}
@@ -7029,7 +7041,7 @@ int smb2_write(struct ksmbd_work *work)
goto out;
}
err = smb2_set_remote_key_for_rdma(work,
- (struct smb2_buffer_desc_v1 *)
+ (struct smbdirect_buffer_descriptor_v1 *)
((char *)req + ch_offset),
req->Channel,
req->WriteChannelInfoLength);
diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h
index 16ae8a10490b..5163d5241b90 100644
--- a/fs/smb/server/smb2pdu.h
+++ b/fs/smb/server/smb2pdu.h
@@ -136,12 +136,6 @@ struct create_posix_rsp {
u8 SidBuffer[44];
} __packed;
-struct smb2_buffer_desc_v1 {
- __le64 offset;
- __le32 token;
- __le32 length;
-} __packed;
-
#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
struct smb_sockaddr_in {
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 5466aa8c39b1..9e644a0daf1c 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -23,18 +23,24 @@
#include "connection.h"
#include "smb_common.h"
#include "../common/smb2status.h"
+#include "../common/smbdirect/smbdirect.h"
+#include "../common/smbdirect/smbdirect_pdu.h"
+#include "../common/smbdirect/smbdirect_socket.h"
#include "transport_rdma.h"
#define SMB_DIRECT_PORT_IWARP 5445
#define SMB_DIRECT_PORT_INFINIBAND 445
-#define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100)
+#define SMB_DIRECT_VERSION_LE cpu_to_le16(SMBDIRECT_V1)
-/* SMB_DIRECT negotiation timeout in seconds */
-#define SMB_DIRECT_NEGOTIATE_TIMEOUT 120
+/* SMB_DIRECT negotiation timeout (for the server) in seconds */
+#define SMB_DIRECT_NEGOTIATE_TIMEOUT 5
-#define SMB_DIRECT_MAX_SEND_SGES 6
-#define SMB_DIRECT_MAX_RECV_SGES 1
+/* The interval for sending a keepalive message to the peer in seconds */
+#define SMB_DIRECT_KEEPALIVE_SEND_INTERVAL 120
+
+/* The timeout to wait for a keepalive response from the peer in seconds */
+#define SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT 5
/*
* Default maximum number of RDMA read/write outstanding on this connection
@@ -87,131 +93,38 @@ static struct smb_direct_listener {
static struct workqueue_struct *smb_direct_wq;
-enum smb_direct_status {
- SMB_DIRECT_CS_NEW = 0,
- SMB_DIRECT_CS_CONNECTED,
- SMB_DIRECT_CS_DISCONNECTING,
- SMB_DIRECT_CS_DISCONNECTED,
-};
-
struct smb_direct_transport {
struct ksmbd_transport transport;
- enum smb_direct_status status;
- bool full_packet_received;
- wait_queue_head_t wait_status;
-
- struct rdma_cm_id *cm_id;
- struct ib_cq *send_cq;
- struct ib_cq *recv_cq;
- struct ib_pd *pd;
- struct ib_qp *qp;
-
- int max_send_size;
- int max_recv_size;
- int max_fragmented_send_size;
- int max_fragmented_recv_size;
- int max_rdma_rw_size;
-
- spinlock_t reassembly_queue_lock;
- struct list_head reassembly_queue;
- int reassembly_data_length;
- int reassembly_queue_length;
- int first_entry_offset;
- wait_queue_head_t wait_reassembly_queue;
-
- spinlock_t receive_credit_lock;
- int recv_credits;
- int count_avail_recvmsg;
- int recv_credit_max;
- int recv_credit_target;
-
- spinlock_t recvmsg_queue_lock;
- struct list_head recvmsg_queue;
-
- int send_credit_target;
- atomic_t send_credits;
- spinlock_t lock_new_recv_credits;
- int new_recv_credits;
- int max_rw_credits;
- int pages_per_rw_credit;
- atomic_t rw_credits;
-
- wait_queue_head_t wait_send_credits;
- wait_queue_head_t wait_rw_credits;
-
- mempool_t *sendmsg_mempool;
- struct kmem_cache *sendmsg_cache;
- mempool_t *recvmsg_mempool;
- struct kmem_cache *recvmsg_cache;
-
- wait_queue_head_t wait_send_pending;
- atomic_t send_pending;
-
- struct delayed_work post_recv_credits_work;
- struct work_struct send_immediate_work;
- struct work_struct disconnect_work;
-
- bool negotiation_requested;
+ struct smbdirect_socket socket;
};
-#define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport))
-#define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \
+#define KSMBD_TRANS(t) (&(t)->transport)
+#define SMBD_TRANS(t) (container_of(t, \
struct smb_direct_transport, transport))
-enum {
- SMB_DIRECT_MSG_NEGOTIATE_REQ = 0,
- SMB_DIRECT_MSG_DATA_TRANSFER
-};
static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops;
-struct smb_direct_send_ctx {
- struct list_head msg_list;
- int wr_cnt;
- bool need_invalidate_rkey;
- unsigned int remote_key;
-};
-
-struct smb_direct_sendmsg {
- struct smb_direct_transport *transport;
- struct ib_send_wr wr;
- struct list_head list;
- int num_sge;
- struct ib_sge sge[SMB_DIRECT_MAX_SEND_SGES];
- struct ib_cqe cqe;
- u8 packet[];
-};
-
-struct smb_direct_recvmsg {
- struct smb_direct_transport *transport;
- struct list_head list;
- int type;
- struct ib_sge sge;
- struct ib_cqe cqe;
- bool first_segment;
- u8 packet[];
-};
-
-struct smb_direct_rdma_rw_msg {
- struct smb_direct_transport *t;
- struct ib_cqe cqe;
- int status;
- struct completion *completion;
- struct list_head list;
- struct rdma_rw_ctx rw_ctx;
- struct sg_table sgt;
- struct scatterlist sg_list[];
-};
-
void init_smbd_max_io_size(unsigned int sz)
{
sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
smb_direct_max_read_write_size = sz;
}
-unsigned int get_smbd_max_read_write_size(void)
+unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt)
{
- return smb_direct_max_read_write_size;
+ struct smb_direct_transport *t;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
+
+ if (kt->ops != &ksmbd_smb_direct_transport_ops)
+ return 0;
+
+ t = SMBD_TRANS(kt);
+ sc = &t->socket;
+ sp = &sc->parameters;
+
+ return sp->max_read_write_size;
}
static inline int get_buf_page_count(void *buf, int size)
@@ -220,71 +133,65 @@ static inline int get_buf_page_count(void *buf, int size)
(uintptr_t)buf / PAGE_SIZE;
}
-static void smb_direct_destroy_pools(struct smb_direct_transport *transport);
+static void smb_direct_destroy_pools(struct smbdirect_socket *sc);
static void smb_direct_post_recv_credits(struct work_struct *work);
-static int smb_direct_post_send_data(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx,
+static int smb_direct_post_send_data(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx,
struct kvec *iov, int niov,
int remaining_data_length);
-static inline struct smb_direct_transport *
-smb_trans_direct_transfort(struct ksmbd_transport *t)
-{
- return container_of(t, struct smb_direct_transport, transport);
-}
-
static inline void
-*smb_direct_recvmsg_payload(struct smb_direct_recvmsg *recvmsg)
+*smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg)
{
return (void *)recvmsg->packet;
}
-static inline bool is_receive_credit_post_required(int receive_credits,
- int avail_recvmsg_count)
-{
- return receive_credits <= (smb_direct_receive_credit_max >> 3) &&
- avail_recvmsg_count >= (receive_credits >> 2);
-}
-
static struct
-smb_direct_recvmsg *get_free_recvmsg(struct smb_direct_transport *t)
+smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc)
{
- struct smb_direct_recvmsg *recvmsg = NULL;
+ struct smbdirect_recv_io *recvmsg = NULL;
+ unsigned long flags;
- spin_lock(&t->recvmsg_queue_lock);
- if (!list_empty(&t->recvmsg_queue)) {
- recvmsg = list_first_entry(&t->recvmsg_queue,
- struct smb_direct_recvmsg,
+ spin_lock_irqsave(&sc->recv_io.free.lock, flags);
+ if (!list_empty(&sc->recv_io.free.list)) {
+ recvmsg = list_first_entry(&sc->recv_io.free.list,
+ struct smbdirect_recv_io,
list);
list_del(&recvmsg->list);
}
- spin_unlock(&t->recvmsg_queue_lock);
+ spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
return recvmsg;
}
-static void put_recvmsg(struct smb_direct_transport *t,
- struct smb_direct_recvmsg *recvmsg)
+static void put_recvmsg(struct smbdirect_socket *sc,
+ struct smbdirect_recv_io *recvmsg)
{
+ unsigned long flags;
+
if (likely(recvmsg->sge.length != 0)) {
- ib_dma_unmap_single(t->cm_id->device,
+ ib_dma_unmap_single(sc->ib.dev,
recvmsg->sge.addr,
recvmsg->sge.length,
DMA_FROM_DEVICE);
recvmsg->sge.length = 0;
}
- spin_lock(&t->recvmsg_queue_lock);
- list_add(&recvmsg->list, &t->recvmsg_queue);
- spin_unlock(&t->recvmsg_queue_lock);
+ spin_lock_irqsave(&sc->recv_io.free.lock, flags);
+ list_add(&recvmsg->list, &sc->recv_io.free.list);
+ spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
+
+ queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
}
-static void enqueue_reassembly(struct smb_direct_transport *t,
- struct smb_direct_recvmsg *recvmsg,
+static void enqueue_reassembly(struct smbdirect_socket *sc,
+ struct smbdirect_recv_io *recvmsg,
int data_length)
{
- spin_lock(&t->reassembly_queue_lock);
- list_add_tail(&recvmsg->list, &t->reassembly_queue);
- t->reassembly_queue_length++;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list);
+ sc->recv_io.reassembly.queue_length++;
/*
* Make sure reassembly_data_length is updated after list and
* reassembly_queue_length are updated. On the dequeue side
@@ -292,85 +199,228 @@ static void enqueue_reassembly(struct smb_direct_transport *t,
* if reassembly_queue_length and list is up to date
*/
virt_wmb();
- t->reassembly_data_length += data_length;
- spin_unlock(&t->reassembly_queue_lock);
+ sc->recv_io.reassembly.data_length += data_length;
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
}
-static struct smb_direct_recvmsg *get_first_reassembly(struct smb_direct_transport *t)
+static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc)
{
- if (!list_empty(&t->reassembly_queue))
- return list_first_entry(&t->reassembly_queue,
- struct smb_direct_recvmsg, list);
+ if (!list_empty(&sc->recv_io.reassembly.list))
+ return list_first_entry(&sc->recv_io.reassembly.list,
+ struct smbdirect_recv_io, list);
else
return NULL;
}
+static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc)
+{
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ wake_up_all(&sc->status_wait);
+ wake_up_all(&sc->send_io.credits.wait_queue);
+ wake_up_all(&sc->send_io.pending.zero_wait_queue);
+ wake_up_all(&sc->recv_io.reassembly.wait_queue);
+ wake_up_all(&sc->rw_io.credits.wait_queue);
+}
+
static void smb_direct_disconnect_rdma_work(struct work_struct *work)
{
- struct smb_direct_transport *t =
- container_of(work, struct smb_direct_transport,
- disconnect_work);
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, disconnect_work);
- if (t->status == SMB_DIRECT_CS_CONNECTED) {
- t->status = SMB_DIRECT_CS_DISCONNECTING;
- rdma_disconnect(t->cm_id);
+ /*
+	 * Make sure this and the other work items are not queued again.
+	 * We must not block here, so we avoid
+	 * disable[_delayed]_work_sync()
+ */
+ disable_work(&sc->disconnect_work);
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_delayed_work(&sc->idle.timer_work);
+ disable_work(&sc->idle.immediate_work);
+
+ if (sc->first_error == 0)
+ sc->first_error = -ECONNABORTED;
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_CONNECTED:
+ case SMBDIRECT_SOCKET_ERROR:
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
+ rdma_disconnect(sc->rdma.cm_id);
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ /*
+ * rdma_accept() never reached
+ * RDMA_CM_EVENT_ESTABLISHED
+ */
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ break;
+
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ break;
}
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ smb_direct_disconnect_wake_up_all(sc);
}
static void
-smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t)
+smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc)
{
- if (t->status == SMB_DIRECT_CS_CONNECTED)
- queue_work(smb_direct_wq, &t->disconnect_work);
+ /*
+	 * Make sure the other work items (besides disconnect_work) are
+	 * not queued again. We must not block here, so we avoid
+	 * disable[_delayed]_work_sync()
+ */
+ disable_work(&sc->recv_io.posted.refill_work);
+ disable_work(&sc->idle.immediate_work);
+ disable_delayed_work(&sc->idle.timer_work);
+
+ if (sc->first_error == 0)
+ sc->first_error = -ECONNABORTED;
+
+ switch (sc->status) {
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+ case SMBDIRECT_SOCKET_ERROR:
+ case SMBDIRECT_SOCKET_DISCONNECTING:
+ case SMBDIRECT_SOCKET_DISCONNECTED:
+ case SMBDIRECT_SOCKET_DESTROYED:
+ /*
+ * Keep the current error status
+ */
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+ case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+ case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+ case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
+ break;
+
+ case SMBDIRECT_SOCKET_CREATED:
+ case SMBDIRECT_SOCKET_CONNECTED:
+ sc->status = SMBDIRECT_SOCKET_ERROR;
+ break;
+ }
+
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ */
+ smb_direct_disconnect_wake_up_all(sc);
+
+ queue_work(sc->workqueue, &sc->disconnect_work);
}
static void smb_direct_send_immediate_work(struct work_struct *work)
{
- struct smb_direct_transport *t = container_of(work,
- struct smb_direct_transport, send_immediate_work);
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, idle.immediate_work);
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return;
+
+ smb_direct_post_send_data(sc, NULL, NULL, 0, 0);
+}
+
+static void smb_direct_idle_connection_timer(struct work_struct *work)
+{
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, idle.timer_work.work);
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+
+ if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
+ smb_direct_disconnect_rdma_connection(sc);
+ return;
+ }
- if (t->status != SMB_DIRECT_CS_CONNECTED)
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return;
- smb_direct_post_send_data(t, NULL, NULL, 0, 0);
+ /*
+ * Now use the keepalive timeout (instead of keepalive interval)
+ * in order to wait for a response
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_timeout_msec));
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
}
static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
{
struct smb_direct_transport *t;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
struct ksmbd_conn *conn;
t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP);
if (!t)
return NULL;
+ sc = &t->socket;
+ smbdirect_socket_init(sc);
+ sp = &sc->parameters;
- t->cm_id = cm_id;
- cm_id->context = t;
-
- t->status = SMB_DIRECT_CS_NEW;
- init_waitqueue_head(&t->wait_status);
+ sc->workqueue = smb_direct_wq;
- spin_lock_init(&t->reassembly_queue_lock);
- INIT_LIST_HEAD(&t->reassembly_queue);
- t->reassembly_data_length = 0;
- t->reassembly_queue_length = 0;
- init_waitqueue_head(&t->wait_reassembly_queue);
- init_waitqueue_head(&t->wait_send_credits);
- init_waitqueue_head(&t->wait_rw_credits);
+ INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work);
- spin_lock_init(&t->receive_credit_lock);
- spin_lock_init(&t->recvmsg_queue_lock);
- INIT_LIST_HEAD(&t->recvmsg_queue);
+ sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000;
+ sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH;
+ sp->responder_resources = 1;
+ sp->recv_credit_max = smb_direct_receive_credit_max;
+ sp->send_credit_target = smb_direct_send_credit_target;
+ sp->max_send_size = smb_direct_max_send_size;
+ sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size;
+ sp->max_recv_size = smb_direct_max_receive_size;
+ sp->max_read_write_size = smb_direct_max_read_write_size;
+ sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000;
+ sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000;
- init_waitqueue_head(&t->wait_send_pending);
- atomic_set(&t->send_pending, 0);
+ sc->rdma.cm_id = cm_id;
+ cm_id->context = sc;
- spin_lock_init(&t->lock_new_recv_credits);
+ sc->ib.dev = sc->rdma.cm_id->device;
- INIT_DELAYED_WORK(&t->post_recv_credits_work,
- smb_direct_post_recv_credits);
- INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work);
- INIT_WORK(&t->disconnect_work, smb_direct_disconnect_rdma_work);
+ INIT_WORK(&sc->recv_io.posted.refill_work,
+ smb_direct_post_recv_credits);
+ INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work);
+ INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer);
conn = ksmbd_conn_alloc();
if (!conn)
@@ -391,89 +441,104 @@ static void smb_direct_free_transport(struct ksmbd_transport *kt)
static void free_transport(struct smb_direct_transport *t)
{
- struct smb_direct_recvmsg *recvmsg;
+ struct smbdirect_socket *sc = &t->socket;
+ struct smbdirect_recv_io *recvmsg;
- wake_up_interruptible(&t->wait_send_credits);
+ disable_work_sync(&sc->disconnect_work);
+ if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) {
+ smb_direct_disconnect_rdma_work(&sc->disconnect_work);
+ wait_event_interruptible(sc->status_wait,
+ sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
+ }
- ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n");
- wait_event(t->wait_send_pending,
- atomic_read(&t->send_pending) == 0);
+ /*
+ * Wake up all waiters in all wait queues
+ * in order to notice the broken connection.
+ *
+ * Most likely this was already called via
+ * smb_direct_disconnect_rdma_work(), but call it again...
+ */
+ smb_direct_disconnect_wake_up_all(sc);
- cancel_work_sync(&t->disconnect_work);
- cancel_delayed_work_sync(&t->post_recv_credits_work);
- cancel_work_sync(&t->send_immediate_work);
+ disable_work_sync(&sc->recv_io.posted.refill_work);
+ disable_delayed_work_sync(&sc->idle.timer_work);
+ disable_work_sync(&sc->idle.immediate_work);
- if (t->qp) {
- ib_drain_qp(t->qp);
- ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs);
- t->qp = NULL;
- rdma_destroy_qp(t->cm_id);
+ if (sc->ib.qp) {
+ ib_drain_qp(sc->ib.qp);
+ ib_mr_pool_destroy(sc->ib.qp, &sc->ib.qp->rdma_mrs);
+ sc->ib.qp = NULL;
+ rdma_destroy_qp(sc->rdma.cm_id);
}
ksmbd_debug(RDMA, "drain the reassembly queue\n");
do {
- spin_lock(&t->reassembly_queue_lock);
- recvmsg = get_first_reassembly(t);
+ unsigned long flags;
+
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ recvmsg = get_first_reassembly(sc);
if (recvmsg) {
list_del(&recvmsg->list);
- spin_unlock(&t->reassembly_queue_lock);
- put_recvmsg(t, recvmsg);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+ put_recvmsg(sc, recvmsg);
} else {
- spin_unlock(&t->reassembly_queue_lock);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
}
} while (recvmsg);
- t->reassembly_data_length = 0;
-
- if (t->send_cq)
- ib_free_cq(t->send_cq);
- if (t->recv_cq)
- ib_free_cq(t->recv_cq);
- if (t->pd)
- ib_dealloc_pd(t->pd);
- if (t->cm_id)
- rdma_destroy_id(t->cm_id);
-
- smb_direct_destroy_pools(t);
+ sc->recv_io.reassembly.data_length = 0;
+
+ if (sc->ib.send_cq)
+ ib_free_cq(sc->ib.send_cq);
+ if (sc->ib.recv_cq)
+ ib_free_cq(sc->ib.recv_cq);
+ if (sc->ib.pd)
+ ib_dealloc_pd(sc->ib.pd);
+ if (sc->rdma.cm_id)
+ rdma_destroy_id(sc->rdma.cm_id);
+
+ smb_direct_destroy_pools(sc);
ksmbd_conn_free(KSMBD_TRANS(t)->conn);
}
-static struct smb_direct_sendmsg
-*smb_direct_alloc_sendmsg(struct smb_direct_transport *t)
+static struct smbdirect_send_io
+*smb_direct_alloc_sendmsg(struct smbdirect_socket *sc)
{
- struct smb_direct_sendmsg *msg;
+ struct smbdirect_send_io *msg;
- msg = mempool_alloc(t->sendmsg_mempool, KSMBD_DEFAULT_GFP);
+ msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP);
if (!msg)
return ERR_PTR(-ENOMEM);
- msg->transport = t;
- INIT_LIST_HEAD(&msg->list);
+ msg->socket = sc;
+ INIT_LIST_HEAD(&msg->sibling_list);
msg->num_sge = 0;
return msg;
}
-static void smb_direct_free_sendmsg(struct smb_direct_transport *t,
- struct smb_direct_sendmsg *msg)
+static void smb_direct_free_sendmsg(struct smbdirect_socket *sc,
+ struct smbdirect_send_io *msg)
{
int i;
if (msg->num_sge > 0) {
- ib_dma_unmap_single(t->cm_id->device,
+ ib_dma_unmap_single(sc->ib.dev,
msg->sge[0].addr, msg->sge[0].length,
DMA_TO_DEVICE);
for (i = 1; i < msg->num_sge; i++)
- ib_dma_unmap_page(t->cm_id->device,
+ ib_dma_unmap_page(sc->ib.dev,
msg->sge[i].addr, msg->sge[i].length,
DMA_TO_DEVICE);
}
- mempool_free(msg, t->sendmsg_mempool);
+ mempool_free(msg, sc->send_io.mem.pool);
}
-static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg)
+static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg)
{
- switch (recvmsg->type) {
- case SMB_DIRECT_MSG_DATA_TRANSFER: {
- struct smb_direct_data_transfer *req =
- (struct smb_direct_data_transfer *)recvmsg->packet;
+ struct smbdirect_socket *sc = recvmsg->socket;
+
+ switch (sc->recv_io.expected) {
+ case SMBDIRECT_EXPECT_DATA_TRANSFER: {
+ struct smbdirect_data_transfer *req =
+ (struct smbdirect_data_transfer *)recvmsg->packet;
struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet
+ le32_to_cpu(req->data_offset));
ksmbd_debug(RDMA,
@@ -482,11 +547,11 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg)
le16_to_cpu(req->credits_requested),
req->data_length, req->remaining_data_length,
hdr->ProtocolId, hdr->Command);
- break;
+ return 0;
}
- case SMB_DIRECT_MSG_NEGOTIATE_REQ: {
- struct smb_direct_negotiate_req *req =
- (struct smb_direct_negotiate_req *)recvmsg->packet;
+ case SMBDIRECT_EXPECT_NEGOTIATE_REQ: {
+ struct smbdirect_negotiate_req *req =
+ (struct smbdirect_negotiate_req *)recvmsg->packet;
ksmbd_debug(RDMA,
"MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n",
le16_to_cpu(req->min_version),
@@ -504,29 +569,34 @@ static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg)
128 * 1024)
return -ECONNABORTED;
- break;
+ return 0;
}
- default:
- return -EINVAL;
+ case SMBDIRECT_EXPECT_NEGOTIATE_REP:
+ /* client only */
+ break;
}
- return 0;
+
+ /* This is an internal error */
+ return -EINVAL;
}
static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smb_direct_recvmsg *recvmsg;
- struct smb_direct_transport *t;
+ struct smbdirect_recv_io *recvmsg;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
- recvmsg = container_of(wc->wr_cqe, struct smb_direct_recvmsg, cqe);
- t = recvmsg->transport;
+ recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
+ sc = recvmsg->socket;
+ sp = &sc->parameters;
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
- put_recvmsg(t, recvmsg);
+ put_recvmsg(sc, recvmsg);
if (wc->status != IB_WC_WR_FLUSH_ERR) {
pr_err("Recv error. status='%s (%d)' opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
- smb_direct_disconnect_rdma_connection(t);
+ smb_direct_disconnect_rdma_connection(sc);
}
return;
}
@@ -538,108 +608,128 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr,
recvmsg->sge.length, DMA_FROM_DEVICE);
- switch (recvmsg->type) {
- case SMB_DIRECT_MSG_NEGOTIATE_REQ:
- if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) {
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
+ /*
+ * Reset timer to the keepalive interval in
+ * order to trigger our next keepalive message.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_interval_msec));
+
+ switch (sc->recv_io.expected) {
+ case SMBDIRECT_EXPECT_NEGOTIATE_REQ:
+ if (wc->byte_len < sizeof(struct smbdirect_negotiate_req)) {
+ put_recvmsg(sc, recvmsg);
+ smb_direct_disconnect_rdma_connection(sc);
return;
}
- t->negotiation_requested = true;
- t->full_packet_received = true;
- t->status = SMB_DIRECT_CS_CONNECTED;
- enqueue_reassembly(t, recvmsg, 0);
- wake_up_interruptible(&t->wait_status);
+ sc->recv_io.reassembly.full_packet_received = true;
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
+ enqueue_reassembly(sc, recvmsg, 0);
+ wake_up(&sc->status_wait);
return;
- case SMB_DIRECT_MSG_DATA_TRANSFER: {
- struct smb_direct_data_transfer *data_transfer =
- (struct smb_direct_data_transfer *)recvmsg->packet;
- unsigned int data_length;
- int avail_recvmsg_count, receive_credits;
+ case SMBDIRECT_EXPECT_DATA_TRANSFER: {
+ struct smbdirect_data_transfer *data_transfer =
+ (struct smbdirect_data_transfer *)recvmsg->packet;
+ u32 remaining_data_length, data_offset, data_length;
+ u16 old_recv_credit_target;
if (wc->byte_len <
- offsetof(struct smb_direct_data_transfer, padding)) {
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
+ offsetof(struct smbdirect_data_transfer, padding)) {
+ put_recvmsg(sc, recvmsg);
+ smb_direct_disconnect_rdma_connection(sc);
return;
}
+ remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
data_length = le32_to_cpu(data_transfer->data_length);
- if (data_length) {
- if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
- (u64)data_length) {
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
- return;
- }
+ data_offset = le32_to_cpu(data_transfer->data_offset);
+ if (wc->byte_len < data_offset ||
+ wc->byte_len < (u64)data_offset + data_length) {
+ put_recvmsg(sc, recvmsg);
+ smb_direct_disconnect_rdma_connection(sc);
+ return;
+ }
+ if (remaining_data_length > sp->max_fragmented_recv_size ||
+ data_length > sp->max_fragmented_recv_size ||
+ (u64)remaining_data_length + (u64)data_length >
+ (u64)sp->max_fragmented_recv_size) {
+ put_recvmsg(sc, recvmsg);
+ smb_direct_disconnect_rdma_connection(sc);
+ return;
+ }
- if (t->full_packet_received)
+ if (data_length) {
+ if (sc->recv_io.reassembly.full_packet_received)
recvmsg->first_segment = true;
if (le32_to_cpu(data_transfer->remaining_data_length))
- t->full_packet_received = false;
+ sc->recv_io.reassembly.full_packet_received = false;
else
- t->full_packet_received = true;
-
- spin_lock(&t->receive_credit_lock);
- receive_credits = --(t->recv_credits);
- avail_recvmsg_count = t->count_avail_recvmsg;
- spin_unlock(&t->receive_credit_lock);
- } else {
- spin_lock(&t->receive_credit_lock);
- receive_credits = --(t->recv_credits);
- avail_recvmsg_count = ++(t->count_avail_recvmsg);
- spin_unlock(&t->receive_credit_lock);
+ sc->recv_io.reassembly.full_packet_received = true;
}
- t->recv_credit_target =
+ atomic_dec(&sc->recv_io.posted.count);
+ atomic_dec(&sc->recv_io.credits.count);
+
+ old_recv_credit_target = sc->recv_io.credits.target;
+ sc->recv_io.credits.target =
le16_to_cpu(data_transfer->credits_requested);
+ sc->recv_io.credits.target =
+ min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
+ sc->recv_io.credits.target =
+ max_t(u16, sc->recv_io.credits.target, 1);
atomic_add(le16_to_cpu(data_transfer->credits_granted),
- &t->send_credits);
+ &sc->send_io.credits.count);
if (le16_to_cpu(data_transfer->flags) &
- SMB_DIRECT_RESPONSE_REQUESTED)
- queue_work(smb_direct_wq, &t->send_immediate_work);
-
- if (atomic_read(&t->send_credits) > 0)
- wake_up_interruptible(&t->wait_send_credits);
+ SMBDIRECT_FLAG_RESPONSE_REQUESTED)
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
- if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count))
- mod_delayed_work(smb_direct_wq,
- &t->post_recv_credits_work, 0);
+ if (atomic_read(&sc->send_io.credits.count) > 0)
+ wake_up(&sc->send_io.credits.wait_queue);
if (data_length) {
- enqueue_reassembly(t, recvmsg, (int)data_length);
- wake_up_interruptible(&t->wait_reassembly_queue);
+ if (sc->recv_io.credits.target > old_recv_credit_target)
+ queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
+
+ enqueue_reassembly(sc, recvmsg, (int)data_length);
+ wake_up(&sc->recv_io.reassembly.wait_queue);
} else
- put_recvmsg(t, recvmsg);
+ put_recvmsg(sc, recvmsg);
return;
}
+ case SMBDIRECT_EXPECT_NEGOTIATE_REP:
+ /* client only */
+ break;
}
/*
* This is an internal error!
*/
- WARN_ON_ONCE(recvmsg->type != SMB_DIRECT_MSG_DATA_TRANSFER);
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
+ WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
+ put_recvmsg(sc, recvmsg);
+ smb_direct_disconnect_rdma_connection(sc);
}
-static int smb_direct_post_recv(struct smb_direct_transport *t,
- struct smb_direct_recvmsg *recvmsg)
+static int smb_direct_post_recv(struct smbdirect_socket *sc,
+ struct smbdirect_recv_io *recvmsg)
{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_recv_wr wr;
int ret;
- recvmsg->sge.addr = ib_dma_map_single(t->cm_id->device,
- recvmsg->packet, t->max_recv_size,
+ recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev,
+ recvmsg->packet,
+ sp->max_recv_size,
DMA_FROM_DEVICE);
- ret = ib_dma_mapping_error(t->cm_id->device, recvmsg->sge.addr);
+ ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr);
if (ret)
return ret;
- recvmsg->sge.length = t->max_recv_size;
- recvmsg->sge.lkey = t->pd->local_dma_lkey;
+ recvmsg->sge.length = sp->max_recv_size;
+ recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey;
recvmsg->cqe.done = recv_done;
wr.wr_cqe = &recvmsg->cqe;
@@ -647,14 +737,14 @@ static int smb_direct_post_recv(struct smb_direct_transport *t,
wr.sg_list = &recvmsg->sge;
wr.num_sge = 1;
- ret = ib_post_recv(t->qp, &wr, NULL);
+ ret = ib_post_recv(sc->ib.qp, &wr, NULL);
if (ret) {
pr_err("Can't post recv: %d\n", ret);
- ib_dma_unmap_single(t->cm_id->device,
+ ib_dma_unmap_single(sc->ib.dev,
recvmsg->sge.addr, recvmsg->sge.length,
DMA_FROM_DEVICE);
recvmsg->sge.length = 0;
- smb_direct_disconnect_rdma_connection(t);
+ smb_direct_disconnect_rdma_connection(sc);
return ret;
}
return ret;
@@ -663,15 +753,16 @@ static int smb_direct_post_recv(struct smb_direct_transport *t,
static int smb_direct_read(struct ksmbd_transport *t, char *buf,
unsigned int size, int unused)
{
- struct smb_direct_recvmsg *recvmsg;
- struct smb_direct_data_transfer *data_transfer;
+ struct smbdirect_recv_io *recvmsg;
+ struct smbdirect_data_transfer *data_transfer;
int to_copy, to_read, data_read, offset;
u32 data_length, remaining_data_length, data_offset;
int rc;
- struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = &st->socket;
again:
- if (st->status != SMB_DIRECT_CS_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
pr_err("disconnected\n");
return -ENOTCONN;
}
@@ -681,9 +772,10 @@ again:
* the only one reading from the front of the queue. The transport
* may add more entries to the back of the queue at the same time
*/
- if (st->reassembly_data_length >= size) {
+ if (sc->recv_io.reassembly.data_length >= size) {
int queue_length;
int queue_removed = 0;
+ unsigned long flags;
/*
* Need to make sure reassembly_data_length is read before
@@ -693,13 +785,13 @@ again:
* updated in SOFTIRQ as more data is received
*/
virt_rmb();
- queue_length = st->reassembly_queue_length;
+ queue_length = sc->recv_io.reassembly.queue_length;
data_read = 0;
to_read = size;
- offset = st->first_entry_offset;
+ offset = sc->recv_io.reassembly.first_entry_offset;
while (data_read < size) {
- recvmsg = get_first_reassembly(st);
- data_transfer = smb_direct_recvmsg_payload(recvmsg);
+ recvmsg = get_first_reassembly(sc);
+ data_transfer = smbdirect_recv_io_payload(recvmsg);
data_length = le32_to_cpu(data_transfer->data_length);
remaining_data_length =
le32_to_cpu(data_transfer->remaining_data_length);
@@ -739,12 +831,12 @@ again:
if (queue_length) {
list_del(&recvmsg->list);
} else {
- spin_lock_irq(&st->reassembly_queue_lock);
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
list_del(&recvmsg->list);
- spin_unlock_irq(&st->reassembly_queue_lock);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
}
queue_removed++;
- put_recvmsg(st, recvmsg);
+ put_recvmsg(sc, recvmsg);
offset = 0;
} else {
offset += to_copy;
@@ -754,34 +846,24 @@ again:
data_read += to_copy;
}
- spin_lock_irq(&st->reassembly_queue_lock);
- st->reassembly_data_length -= data_read;
- st->reassembly_queue_length -= queue_removed;
- spin_unlock_irq(&st->reassembly_queue_lock);
-
- spin_lock(&st->receive_credit_lock);
- st->count_avail_recvmsg += queue_removed;
- if (is_receive_credit_post_required(st->recv_credits, st->count_avail_recvmsg)) {
- spin_unlock(&st->receive_credit_lock);
- mod_delayed_work(smb_direct_wq,
- &st->post_recv_credits_work, 0);
- } else {
- spin_unlock(&st->receive_credit_lock);
- }
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ sc->recv_io.reassembly.data_length -= data_read;
+ sc->recv_io.reassembly.queue_length -= queue_removed;
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
- st->first_entry_offset = offset;
+ sc->recv_io.reassembly.first_entry_offset = offset;
ksmbd_debug(RDMA,
"returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
- data_read, st->reassembly_data_length,
- st->first_entry_offset);
+ data_read, sc->recv_io.reassembly.data_length,
+ sc->recv_io.reassembly.first_entry_offset);
read_rfc1002_done:
return data_read;
}
ksmbd_debug(RDMA, "wait_event on more data\n");
- rc = wait_event_interruptible(st->wait_reassembly_queue,
- st->reassembly_data_length >= size ||
- st->status != SMB_DIRECT_CS_CONNECTED);
+ rc = wait_event_interruptible(sc->recv_io.reassembly.wait_queue,
+ sc->recv_io.reassembly.data_length >= size ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (rc)
return -EINTR;
@@ -790,56 +872,44 @@ read_rfc1002_done:
static void smb_direct_post_recv_credits(struct work_struct *work)
{
- struct smb_direct_transport *t = container_of(work,
- struct smb_direct_transport, post_recv_credits_work.work);
- struct smb_direct_recvmsg *recvmsg;
- int receive_credits, credits = 0;
+ struct smbdirect_socket *sc =
+ container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
+ struct smbdirect_recv_io *recvmsg;
+ int credits = 0;
int ret;
- spin_lock(&t->receive_credit_lock);
- receive_credits = t->recv_credits;
- spin_unlock(&t->receive_credit_lock);
-
- if (receive_credits < t->recv_credit_target) {
+ if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) {
while (true) {
- recvmsg = get_free_recvmsg(t);
+ recvmsg = get_free_recvmsg(sc);
if (!recvmsg)
break;
- recvmsg->type = SMB_DIRECT_MSG_DATA_TRANSFER;
recvmsg->first_segment = false;
- ret = smb_direct_post_recv(t, recvmsg);
+ ret = smb_direct_post_recv(sc, recvmsg);
if (ret) {
pr_err("Can't post recv: %d\n", ret);
- put_recvmsg(t, recvmsg);
+ put_recvmsg(sc, recvmsg);
break;
}
credits++;
+
+ atomic_inc(&sc->recv_io.posted.count);
}
}
- spin_lock(&t->receive_credit_lock);
- t->recv_credits += credits;
- t->count_avail_recvmsg -= credits;
- spin_unlock(&t->receive_credit_lock);
-
- spin_lock(&t->lock_new_recv_credits);
- t->new_recv_credits += credits;
- spin_unlock(&t->lock_new_recv_credits);
-
if (credits)
- queue_work(smb_direct_wq, &t->send_immediate_work);
+ queue_work(sc->workqueue, &sc->idle.immediate_work);
}
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smb_direct_sendmsg *sendmsg, *sibling;
- struct smb_direct_transport *t;
+ struct smbdirect_send_io *sendmsg, *sibling;
+ struct smbdirect_socket *sc;
struct list_head *pos, *prev, *end;
- sendmsg = container_of(wc->wr_cqe, struct smb_direct_sendmsg, cqe);
- t = sendmsg->transport;
+ sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
+ sc = sendmsg->socket;
ksmbd_debug(RDMA, "Send completed. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
@@ -849,55 +919,78 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
pr_err("Send error. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
- smb_direct_disconnect_rdma_connection(t);
+ smb_direct_disconnect_rdma_connection(sc);
}
- if (atomic_dec_and_test(&t->send_pending))
- wake_up(&t->wait_send_pending);
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
/* iterate and free the list of messages in reverse. the list's head
* is invalid.
*/
- for (pos = &sendmsg->list, prev = pos->prev, end = sendmsg->list.next;
+ for (pos = &sendmsg->sibling_list, prev = pos->prev, end = sendmsg->sibling_list.next;
prev != end; pos = prev, prev = prev->prev) {
- sibling = container_of(pos, struct smb_direct_sendmsg, list);
- smb_direct_free_sendmsg(t, sibling);
+ sibling = container_of(pos, struct smbdirect_send_io, sibling_list);
+ smb_direct_free_sendmsg(sc, sibling);
}
- sibling = container_of(pos, struct smb_direct_sendmsg, list);
- smb_direct_free_sendmsg(t, sibling);
+ sibling = container_of(pos, struct smbdirect_send_io, sibling_list);
+ smb_direct_free_sendmsg(sc, sibling);
}
-static int manage_credits_prior_sending(struct smb_direct_transport *t)
+static int manage_credits_prior_sending(struct smbdirect_socket *sc)
{
int new_credits;
- spin_lock(&t->lock_new_recv_credits);
- new_credits = t->new_recv_credits;
- t->new_recv_credits = 0;
- spin_unlock(&t->lock_new_recv_credits);
+ if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target)
+ return 0;
+
+ new_credits = atomic_read(&sc->recv_io.posted.count);
+ if (new_credits == 0)
+ return 0;
+
+ new_credits -= atomic_read(&sc->recv_io.credits.count);
+ if (new_credits <= 0)
+ return 0;
+ atomic_add(new_credits, &sc->recv_io.credits.count);
return new_credits;
}
-static int smb_direct_post_send(struct smb_direct_transport *t,
+static int manage_keep_alive_before_sending(struct smbdirect_socket *sc)
+{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+
+ if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) {
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT;
+ /*
+ * Now use the keepalive timeout (instead of keepalive interval)
+ * in order to wait for a response
+ */
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->keepalive_timeout_msec));
+ return 1;
+ }
+ return 0;
+}
+
+static int smb_direct_post_send(struct smbdirect_socket *sc,
struct ib_send_wr *wr)
{
int ret;
- atomic_inc(&t->send_pending);
- ret = ib_post_send(t->qp, wr, NULL);
+ atomic_inc(&sc->send_io.pending.count);
+ ret = ib_post_send(sc->ib.qp, wr, NULL);
if (ret) {
pr_err("failed to post send: %d\n", ret);
- if (atomic_dec_and_test(&t->send_pending))
- wake_up(&t->wait_send_pending);
- smb_direct_disconnect_rdma_connection(t);
+ if (atomic_dec_and_test(&sc->send_io.pending.count))
+ wake_up(&sc->send_io.pending.zero_wait_queue);
+ smb_direct_disconnect_rdma_connection(sc);
}
return ret;
}
-static void smb_direct_send_ctx_init(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx,
+static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx,
bool need_invalidate_rkey,
unsigned int remote_key)
{
@@ -907,47 +1000,50 @@ static void smb_direct_send_ctx_init(struct smb_direct_transport *t,
send_ctx->remote_key = remote_key;
}
-static int smb_direct_flush_send_list(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx,
+static int smb_direct_flush_send_list(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx,
bool is_last)
{
- struct smb_direct_sendmsg *first, *last;
+ struct smbdirect_send_io *first, *last;
int ret;
if (list_empty(&send_ctx->msg_list))
return 0;
first = list_first_entry(&send_ctx->msg_list,
- struct smb_direct_sendmsg,
- list);
+ struct smbdirect_send_io,
+ sibling_list);
last = list_last_entry(&send_ctx->msg_list,
- struct smb_direct_sendmsg,
- list);
+ struct smbdirect_send_io,
+ sibling_list);
+
+ if (send_ctx->need_invalidate_rkey) {
+ first->wr.opcode = IB_WR_SEND_WITH_INV;
+ first->wr.ex.invalidate_rkey = send_ctx->remote_key;
+ send_ctx->need_invalidate_rkey = false;
+ send_ctx->remote_key = 0;
+ }
last->wr.send_flags = IB_SEND_SIGNALED;
last->wr.wr_cqe = &last->cqe;
- if (is_last && send_ctx->need_invalidate_rkey) {
- last->wr.opcode = IB_WR_SEND_WITH_INV;
- last->wr.ex.invalidate_rkey = send_ctx->remote_key;
- }
- ret = smb_direct_post_send(t, &first->wr);
+ ret = smb_direct_post_send(sc, &first->wr);
if (!ret) {
- smb_direct_send_ctx_init(t, send_ctx,
+ smb_direct_send_ctx_init(send_ctx,
send_ctx->need_invalidate_rkey,
send_ctx->remote_key);
} else {
- atomic_add(send_ctx->wr_cnt, &t->send_credits);
- wake_up(&t->wait_send_credits);
+ atomic_add(send_ctx->wr_cnt, &sc->send_io.credits.count);
+ wake_up(&sc->send_io.credits.wait_queue);
list_for_each_entry_safe(first, last, &send_ctx->msg_list,
- list) {
- smb_direct_free_sendmsg(t, first);
+ sibling_list) {
+ smb_direct_free_sendmsg(sc, first);
}
}
return ret;
}
-static int wait_for_credits(struct smb_direct_transport *t,
+static int wait_for_credits(struct smbdirect_socket *sc,
wait_queue_head_t *waitq, atomic_t *total_credits,
int needed)
{
@@ -960,61 +1056,68 @@ static int wait_for_credits(struct smb_direct_transport *t,
atomic_add(needed, total_credits);
ret = wait_event_interruptible(*waitq,
atomic_read(total_credits) >= needed ||
- t->status != SMB_DIRECT_CS_CONNECTED);
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (t->status != SMB_DIRECT_CS_CONNECTED)
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return -ENOTCONN;
else if (ret < 0)
return ret;
} while (true);
}
-static int wait_for_send_credits(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx)
+static int wait_for_send_credits(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx)
{
int ret;
if (send_ctx &&
- (send_ctx->wr_cnt >= 16 || atomic_read(&t->send_credits) <= 1)) {
- ret = smb_direct_flush_send_list(t, send_ctx, false);
+ (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) {
+ ret = smb_direct_flush_send_list(sc, send_ctx, false);
if (ret)
return ret;
}
- return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1);
+ return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1);
}
-static int wait_for_rw_credits(struct smb_direct_transport *t, int credits)
+static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits)
{
- return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits);
+ return wait_for_credits(sc,
+ &sc->rw_io.credits.wait_queue,
+ &sc->rw_io.credits.count,
+ credits);
}
-static int calc_rw_credits(struct smb_direct_transport *t,
+static int calc_rw_credits(struct smbdirect_socket *sc,
char *buf, unsigned int len)
{
return DIV_ROUND_UP(get_buf_page_count(buf, len),
- t->pages_per_rw_credit);
+ sc->rw_io.credits.num_pages);
}
-static int smb_direct_create_header(struct smb_direct_transport *t,
+static int smb_direct_create_header(struct smbdirect_socket *sc,
int size, int remaining_data_length,
- struct smb_direct_sendmsg **sendmsg_out)
+ struct smbdirect_send_io **sendmsg_out)
{
- struct smb_direct_sendmsg *sendmsg;
- struct smb_direct_data_transfer *packet;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_send_io *sendmsg;
+ struct smbdirect_data_transfer *packet;
int header_length;
int ret;
- sendmsg = smb_direct_alloc_sendmsg(t);
+ sendmsg = smb_direct_alloc_sendmsg(sc);
if (IS_ERR(sendmsg))
return PTR_ERR(sendmsg);
/* Fill in the packet header */
- packet = (struct smb_direct_data_transfer *)sendmsg->packet;
- packet->credits_requested = cpu_to_le16(t->send_credit_target);
- packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(t));
+ packet = (struct smbdirect_data_transfer *)sendmsg->packet;
+ packet->credits_requested = cpu_to_le16(sp->send_credit_target);
+ packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc));
packet->flags = 0;
+ if (manage_keep_alive_before_sending(sc))
+ packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
+
packet->reserved = 0;
if (!size)
packet->data_offset = 0;
@@ -1033,25 +1136,25 @@ static int smb_direct_create_header(struct smb_direct_transport *t,
le32_to_cpu(packet->remaining_data_length));
/* Map the packet to DMA */
- header_length = sizeof(struct smb_direct_data_transfer);
+ header_length = sizeof(struct smbdirect_data_transfer);
/* If this is a packet without payload, don't send padding */
if (!size)
header_length =
- offsetof(struct smb_direct_data_transfer, padding);
+ offsetof(struct smbdirect_data_transfer, padding);
- sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device,
+ sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev,
(void *)packet,
header_length,
DMA_TO_DEVICE);
- ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr);
+ ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr);
if (ret) {
- smb_direct_free_sendmsg(t, sendmsg);
+ smb_direct_free_sendmsg(sc, sendmsg);
return ret;
}
sendmsg->num_sge = 1;
sendmsg->sge[0].length = header_length;
- sendmsg->sge[0].lkey = t->pd->local_dma_lkey;
+ sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey;
*sendmsg_out = sendmsg;
return 0;
@@ -1101,14 +1204,14 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size,
return ib_dma_map_sg(device, sg_list, npages, dir);
}
-static int post_sendmsg(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx,
- struct smb_direct_sendmsg *msg)
+static int post_sendmsg(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx,
+ struct smbdirect_send_io *msg)
{
int i;
for (i = 0; i < msg->num_sge; i++)
- ib_dma_sync_single_for_device(t->cm_id->device,
+ ib_dma_sync_single_for_device(sc->ib.dev,
msg->sge[i].addr, msg->sge[i].length,
DMA_TO_DEVICE);
@@ -1122,34 +1225,34 @@ static int post_sendmsg(struct smb_direct_transport *t,
msg->wr.wr_cqe = NULL;
msg->wr.send_flags = 0;
if (!list_empty(&send_ctx->msg_list)) {
- struct smb_direct_sendmsg *last;
+ struct smbdirect_send_io *last;
last = list_last_entry(&send_ctx->msg_list,
- struct smb_direct_sendmsg,
- list);
+ struct smbdirect_send_io,
+ sibling_list);
last->wr.next = &msg->wr;
}
- list_add_tail(&msg->list, &send_ctx->msg_list);
+ list_add_tail(&msg->sibling_list, &send_ctx->msg_list);
send_ctx->wr_cnt++;
return 0;
}
msg->wr.wr_cqe = &msg->cqe;
msg->wr.send_flags = IB_SEND_SIGNALED;
- return smb_direct_post_send(t, &msg->wr);
+ return smb_direct_post_send(sc, &msg->wr);
}
-static int smb_direct_post_send_data(struct smb_direct_transport *t,
- struct smb_direct_send_ctx *send_ctx,
+static int smb_direct_post_send_data(struct smbdirect_socket *sc,
+ struct smbdirect_send_batch *send_ctx,
struct kvec *iov, int niov,
int remaining_data_length)
{
int i, j, ret;
- struct smb_direct_sendmsg *msg;
+ struct smbdirect_send_io *msg;
int data_length;
- struct scatterlist sg[SMB_DIRECT_MAX_SEND_SGES - 1];
+ struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1];
- ret = wait_for_send_credits(t, send_ctx);
+ ret = wait_for_send_credits(sc, send_ctx);
if (ret)
return ret;
@@ -1157,10 +1260,10 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t,
for (i = 0; i < niov; i++)
data_length += iov[i].iov_len;
- ret = smb_direct_create_header(t, data_length, remaining_data_length,
+ ret = smb_direct_create_header(sc, data_length, remaining_data_length,
&msg);
if (ret) {
- atomic_inc(&t->send_credits);
+ atomic_inc(&sc->send_io.credits.count);
return ret;
}
@@ -1168,19 +1271,19 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t,
struct ib_sge *sge;
int sg_cnt;
- sg_init_table(sg, SMB_DIRECT_MAX_SEND_SGES - 1);
- sg_cnt = get_mapped_sg_list(t->cm_id->device,
+ sg_init_table(sg, SMBDIRECT_SEND_IO_MAX_SGE - 1);
+ sg_cnt = get_mapped_sg_list(sc->ib.dev,
iov[i].iov_base, iov[i].iov_len,
- sg, SMB_DIRECT_MAX_SEND_SGES - 1,
+ sg, SMBDIRECT_SEND_IO_MAX_SGE - 1,
DMA_TO_DEVICE);
if (sg_cnt <= 0) {
pr_err("failed to map buffer\n");
ret = -ENOMEM;
goto err;
- } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES) {
+ } else if (sg_cnt + msg->num_sge > SMBDIRECT_SEND_IO_MAX_SGE) {
pr_err("buffer not fitted into sges\n");
ret = -E2BIG;
- ib_dma_unmap_sg(t->cm_id->device, sg, sg_cnt,
+ ib_dma_unmap_sg(sc->ib.dev, sg, sg_cnt,
DMA_TO_DEVICE);
goto err;
}
@@ -1189,18 +1292,18 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t,
sge = &msg->sge[msg->num_sge];
sge->addr = sg_dma_address(&sg[j]);
sge->length = sg_dma_len(&sg[j]);
- sge->lkey = t->pd->local_dma_lkey;
+ sge->lkey = sc->ib.pd->local_dma_lkey;
msg->num_sge++;
}
}
- ret = post_sendmsg(t, send_ctx, msg);
+ ret = post_sendmsg(sc, send_ctx, msg);
if (ret)
goto err;
return 0;
err:
- smb_direct_free_sendmsg(t, msg);
- atomic_inc(&t->send_credits);
+ smb_direct_free_sendmsg(sc, msg);
+ atomic_inc(&sc->send_io.credits.count);
return ret;
}
@@ -1208,79 +1311,133 @@ static int smb_direct_writev(struct ksmbd_transport *t,
struct kvec *iov, int niovs, int buflen,
bool need_invalidate, unsigned int remote_key)
{
- struct smb_direct_transport *st = smb_trans_direct_transfort(t);
- int remaining_data_length;
- int start, i, j;
- int max_iov_size = st->max_send_size -
- sizeof(struct smb_direct_data_transfer);
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = &st->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ size_t remaining_data_length;
+ size_t iov_idx;
+ size_t iov_ofs;
+ size_t max_iov_size = sp->max_send_size -
+ sizeof(struct smbdirect_data_transfer);
int ret;
- struct kvec vec;
- struct smb_direct_send_ctx send_ctx;
+ struct smbdirect_send_batch send_ctx;
+ int error = 0;
- if (st->status != SMB_DIRECT_CS_CONNECTED)
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return -ENOTCONN;
//FIXME: skip RFC1002 header..
+ if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4))
+ return -EINVAL;
buflen -= 4;
+ iov_idx = 1;
+ iov_ofs = 0;
remaining_data_length = buflen;
ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
- smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key);
- start = i = 1;
- buflen = 0;
- while (true) {
- buflen += iov[i].iov_len;
- if (buflen > max_iov_size) {
- if (i > start) {
- remaining_data_length -=
- (buflen - iov[i].iov_len);
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
+ smb_direct_send_ctx_init(&send_ctx, need_invalidate, remote_key);
+ while (remaining_data_length) {
+ struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */
+ size_t possible_bytes = max_iov_size;
+ size_t possible_vecs;
+ size_t bytes = 0;
+ size_t nvecs = 0;
+
+ /*
+		 * For the last message, remaining_data_length should
+		 * have been 0 already!
+ */
+ if (WARN_ON_ONCE(iov_idx >= niovs)) {
+ error = -EINVAL;
+ goto done;
+ }
+
+ /*
+ * We have 2 factors which limit the arguments we pass
+ * to smb_direct_post_send_data():
+ *
+ * 1. The number of supported sges for the send,
+		 *    of which one is reserved for the smbdirect header.
+ * And we currently need one SGE per page.
+ * 2. The number of negotiated payload bytes per send.
+ */
+ possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);
+
+ while (iov_idx < niovs && possible_vecs && possible_bytes) {
+ struct kvec *v = &vecs[nvecs];
+ int page_count;
+
+ v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
+ v->iov_len = min_t(size_t,
+ iov[iov_idx].iov_len - iov_ofs,
+ possible_bytes);
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (page_count > possible_vecs) {
+ /*
+ * If the number of pages in the buffer
+				 * is too large (because we currently require
+ * one SGE per page), we need to limit the
+ * length.
+ *
+ * We know possible_vecs is at least 1,
+ * so we always keep the first page.
+ *
+				 * We need to calculate the number of extra
+ * pages (epages) we can also keep.
+ *
+ * We calculate the number of bytes in the
+				 * first page (fplen); this should never be
+ * larger than v->iov_len because page_count is
+ * at least 2, but adding a limitation feels
+ * better.
+ *
+ * Then we calculate the number of bytes (elen)
+ * we can keep for the extra pages.
+ */
+ size_t epages = possible_vecs - 1;
+ size_t fpofs = offset_in_page(v->iov_base);
+ size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
+ size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);
+
+ v->iov_len = fplen + elen;
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (WARN_ON_ONCE(page_count > possible_vecs)) {
+ /*
+ * Something went wrong in the above
+ * logic...
+ */
+ error = -EINVAL;
goto done;
- } else {
- /* iov[start] is too big, break it */
- int nvec = (buflen + max_iov_size - 1) /
- max_iov_size;
-
- for (j = 0; j < nvec; j++) {
- vec.iov_base =
- (char *)iov[start].iov_base +
- j * max_iov_size;
- vec.iov_len =
- min_t(int, max_iov_size,
- buflen - max_iov_size * j);
- remaining_data_length -= vec.iov_len;
- ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1,
- remaining_data_length);
- if (ret)
- goto done;
}
- i++;
- if (i == niovs)
- break;
}
- start = i;
- buflen = 0;
- } else {
- i++;
- if (i == niovs) {
- /* send out all remaining vecs */
- remaining_data_length -= buflen;
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
- goto done;
- break;
+ possible_vecs -= page_count;
+ nvecs += 1;
+ possible_bytes -= v->iov_len;
+ bytes += v->iov_len;
+
+ iov_ofs += v->iov_len;
+ if (iov_ofs >= iov[iov_idx].iov_len) {
+ iov_idx += 1;
+ iov_ofs = 0;
}
}
+
+ remaining_data_length -= bytes;
+
+ ret = smb_direct_post_send_data(sc, &send_ctx,
+ vecs, nvecs,
+ remaining_data_length);
+ if (unlikely(ret)) {
+ error = ret;
+ goto done;
+ }
}
done:
- ret = smb_direct_flush_send_list(st, &send_ctx, true);
+ ret = smb_direct_flush_send_list(sc, &send_ctx, true);
+ if (unlikely(!ret && error))
+ ret = error;
/*
* As an optimization, we don't wait for individual I/O to finish
@@ -1289,16 +1446,22 @@ done:
* that means all the I/Os have been out and we are good to return
*/
- wait_event(st->wait_send_pending,
- atomic_read(&st->send_pending) == 0);
+ wait_event(sc->send_io.pending.zero_wait_queue,
+ atomic_read(&sc->send_io.pending.count) == 0 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0)
+ ret = -ENOTCONN;
+
return ret;
}
static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
- struct smb_direct_rdma_rw_msg *msg,
+ struct smbdirect_rw_io *msg,
enum dma_data_direction dir)
{
- rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
+ struct smbdirect_socket *sc = &t->socket;
+
+ rdma_rw_ctx_destroy(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
msg->sgt.sgl, msg->sgt.nents, dir);
sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
kfree(msg);
@@ -1307,16 +1470,16 @@ static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
enum dma_data_direction dir)
{
- struct smb_direct_rdma_rw_msg *msg = container_of(wc->wr_cqe,
- struct smb_direct_rdma_rw_msg, cqe);
- struct smb_direct_transport *t = msg->t;
+ struct smbdirect_rw_io *msg =
+ container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe);
+ struct smbdirect_socket *sc = msg->socket;
if (wc->status != IB_WC_SUCCESS) {
- msg->status = -EIO;
+ msg->error = -EIO;
pr_err("read/write error. opcode = %d, status = %s(%d)\n",
wc->opcode, ib_wc_status_msg(wc->status), wc->status);
if (wc->status != IB_WC_WR_FLUSH_ERR)
- smb_direct_disconnect_rdma_connection(t);
+ smb_direct_disconnect_rdma_connection(sc);
}
complete(msg->completion);
@@ -1334,11 +1497,13 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc)
static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
void *buf, int buf_len,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len,
bool is_read)
{
- struct smb_direct_rdma_rw_msg *msg, *next_msg;
+ struct smbdirect_socket *sc = &t->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_rw_io *msg, *next_msg;
int i, ret;
DECLARE_COMPLETION_ONSTACK(completion);
struct ib_send_wr *first_wr;
@@ -1347,10 +1512,10 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
int credits_needed;
unsigned int desc_buf_len, desc_num = 0;
- if (t->status != SMB_DIRECT_CS_CONNECTED)
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return -ENOTCONN;
- if (buf_len > t->max_rdma_rw_size)
+ if (buf_len > sp->max_read_write_size)
return -EINVAL;
/* calculate needed credits */
@@ -1370,7 +1535,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
buf_len = 0;
}
- credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len);
+ credits_needed += calc_rw_credits(sc, desc_buf, desc_buf_len);
desc_buf += desc_buf_len;
buf_len -= desc_buf_len;
desc_num++;
@@ -1379,7 +1544,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n",
str_read_write(is_read), buf_len, credits_needed);
- ret = wait_for_rw_credits(t, credits_needed);
+ ret = wait_for_rw_credits(sc, credits_needed);
if (ret < 0)
return ret;
@@ -1395,7 +1560,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
desc_buf_len = le32_to_cpu(desc[i].length);
- msg->t = t;
+ msg->socket = sc;
msg->cqe.done = is_read ? read_done : write_done;
msg->completion = &completion;
@@ -1417,7 +1582,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
goto out;
}
- ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
+ ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
msg->sgt.sgl,
get_buf_page_count(desc_buf, desc_buf_len),
0,
@@ -1438,96 +1603,94 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
/* concatenate work requests of rdma_rw_ctxs */
first_wr = NULL;
list_for_each_entry_reverse(msg, &msg_list, list) {
- first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
+ first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
&msg->cqe, first_wr);
}
- ret = ib_post_send(t->qp, first_wr, NULL);
+ ret = ib_post_send(sc->ib.qp, first_wr, NULL);
if (ret) {
pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
goto out;
}
- msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list);
+ msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list);
wait_for_completion(&completion);
- ret = msg->status;
+ ret = msg->error;
out:
list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
list_del(&msg->list);
smb_direct_free_rdma_rw_msg(t, msg,
is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
}
- atomic_add(credits_needed, &t->rw_credits);
- wake_up(&t->wait_rw_credits);
+ atomic_add(credits_needed, &sc->rw_io.credits.count);
+ wake_up(&sc->rw_io.credits.wait_queue);
return ret;
}
static int smb_direct_rdma_write(struct ksmbd_transport *t,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len)
{
- return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
+ return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
desc, desc_len, false);
}
static int smb_direct_rdma_read(struct ksmbd_transport *t,
void *buf, unsigned int buflen,
- struct smb2_buffer_desc_v1 *desc,
+ struct smbdirect_buffer_descriptor_v1 *desc,
unsigned int desc_len)
{
- return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
+ return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
desc, desc_len, true);
}
static void smb_direct_disconnect(struct ksmbd_transport *t)
{
- struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = &st->socket;
- ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", st->cm_id);
+ ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id);
- smb_direct_disconnect_rdma_work(&st->disconnect_work);
- wait_event_interruptible(st->wait_status,
- st->status == SMB_DIRECT_CS_DISCONNECTED);
free_transport(st);
}
static void smb_direct_shutdown(struct ksmbd_transport *t)
{
- struct smb_direct_transport *st = smb_trans_direct_transfort(t);
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = &st->socket;
- ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", st->cm_id);
+ ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id);
- smb_direct_disconnect_rdma_work(&st->disconnect_work);
+ smb_direct_disconnect_rdma_work(&sc->disconnect_work);
}
static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
- struct smb_direct_transport *t = cm_id->context;
+ struct smbdirect_socket *sc = cm_id->context;
ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n",
cm_id, rdma_event_msg(event->event), event->event);
switch (event->event) {
case RDMA_CM_EVENT_ESTABLISHED: {
- t->status = SMB_DIRECT_CS_CONNECTED;
- wake_up_interruptible(&t->wait_status);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING);
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
+ wake_up(&sc->status_wait);
break;
}
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_DISCONNECTED: {
- ib_drain_qp(t->qp);
+ ib_drain_qp(sc->ib.qp);
- t->status = SMB_DIRECT_CS_DISCONNECTED;
- wake_up_interruptible(&t->wait_status);
- wake_up_interruptible(&t->wait_reassembly_queue);
- wake_up(&t->wait_send_credits);
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ smb_direct_disconnect_rdma_work(&sc->disconnect_work);
break;
}
case RDMA_CM_EVENT_CONNECT_ERROR: {
- t->status = SMB_DIRECT_CS_DISCONNECTED;
- wake_up_interruptible(&t->wait_status);
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ smb_direct_disconnect_rdma_work(&sc->disconnect_work);
break;
}
default:
@@ -1541,38 +1704,41 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
static void smb_direct_qpair_handler(struct ib_event *event, void *context)
{
- struct smb_direct_transport *t = context;
+ struct smbdirect_socket *sc = context;
ksmbd_debug(RDMA, "Received QP event. cm_id=%p, event=%s (%d)\n",
- t->cm_id, ib_event_msg(event->event), event->event);
+ sc->rdma.cm_id, ib_event_msg(event->event), event->event);
switch (event->event) {
case IB_EVENT_CQ_ERR:
case IB_EVENT_QP_FATAL:
- smb_direct_disconnect_rdma_connection(t);
+ smb_direct_disconnect_rdma_connection(sc);
break;
default:
break;
}
}
-static int smb_direct_send_negotiate_response(struct smb_direct_transport *t,
+static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc,
int failed)
{
- struct smb_direct_sendmsg *sendmsg;
- struct smb_direct_negotiate_resp *resp;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_send_io *sendmsg;
+ struct smbdirect_negotiate_resp *resp;
int ret;
- sendmsg = smb_direct_alloc_sendmsg(t);
+ sendmsg = smb_direct_alloc_sendmsg(sc);
if (IS_ERR(sendmsg))
return -ENOMEM;
- resp = (struct smb_direct_negotiate_resp *)sendmsg->packet;
+ resp = (struct smbdirect_negotiate_resp *)sendmsg->packet;
if (failed) {
memset(resp, 0, sizeof(*resp));
- resp->min_version = cpu_to_le16(0x0100);
- resp->max_version = cpu_to_le16(0x0100);
+ resp->min_version = SMB_DIRECT_VERSION_LE;
+ resp->max_version = SMB_DIRECT_VERSION_LE;
resp->status = STATUS_NOT_SUPPORTED;
+
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
} else {
resp->status = STATUS_SUCCESS;
resp->min_version = SMB_DIRECT_VERSION_LE;
@@ -1580,57 +1746,65 @@ static int smb_direct_send_negotiate_response(struct smb_direct_transport *t,
resp->negotiated_version = SMB_DIRECT_VERSION_LE;
resp->reserved = 0;
resp->credits_requested =
- cpu_to_le16(t->send_credit_target);
- resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(t));
- resp->max_readwrite_size = cpu_to_le32(t->max_rdma_rw_size);
- resp->preferred_send_size = cpu_to_le32(t->max_send_size);
- resp->max_receive_size = cpu_to_le32(t->max_recv_size);
+ cpu_to_le16(sp->send_credit_target);
+ resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc));
+ resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size);
+ resp->preferred_send_size = cpu_to_le32(sp->max_send_size);
+ resp->max_receive_size = cpu_to_le32(sp->max_recv_size);
resp->max_fragmented_size =
- cpu_to_le32(t->max_fragmented_recv_size);
+ cpu_to_le32(sp->max_fragmented_recv_size);
+
+ sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
+ sc->status = SMBDIRECT_SOCKET_CONNECTED;
}
- sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device,
+ sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev,
(void *)resp, sizeof(*resp),
DMA_TO_DEVICE);
- ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr);
+ ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr);
if (ret) {
- smb_direct_free_sendmsg(t, sendmsg);
+ smb_direct_free_sendmsg(sc, sendmsg);
return ret;
}
sendmsg->num_sge = 1;
sendmsg->sge[0].length = sizeof(*resp);
- sendmsg->sge[0].lkey = t->pd->local_dma_lkey;
+ sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey;
- ret = post_sendmsg(t, NULL, sendmsg);
+ ret = post_sendmsg(sc, NULL, sendmsg);
if (ret) {
- smb_direct_free_sendmsg(t, sendmsg);
+ smb_direct_free_sendmsg(sc, sendmsg);
return ret;
}
- wait_event(t->wait_send_pending,
- atomic_read(&t->send_pending) == 0);
+ wait_event(sc->send_io.pending.zero_wait_queue,
+ atomic_read(&sc->send_io.pending.count) == 0 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+ return -ENOTCONN;
+
return 0;
}
-static int smb_direct_accept_client(struct smb_direct_transport *t)
+static int smb_direct_accept_client(struct smbdirect_socket *sc)
{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct rdma_conn_param conn_param;
- struct ib_port_immutable port_immutable;
- u32 ird_ord_hdr[2];
+ __be32 ird_ord_hdr[2];
int ret;
+ /*
+ * smb_direct_handle_connect_request()
+ * already negotiated sp->initiator_depth
+ * and sp->responder_resources
+ */
memset(&conn_param, 0, sizeof(conn_param));
- conn_param.initiator_depth = min_t(u8, t->cm_id->device->attrs.max_qp_rd_atom,
- SMB_DIRECT_CM_INITIATOR_DEPTH);
- conn_param.responder_resources = 0;
-
- t->cm_id->device->ops.get_port_immutable(t->cm_id->device,
- t->cm_id->port_num,
- &port_immutable);
- if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
- ird_ord_hdr[0] = conn_param.responder_resources;
- ird_ord_hdr[1] = 1;
+ conn_param.initiator_depth = sp->initiator_depth;
+ conn_param.responder_resources = sp->responder_resources;
+
+ if (sc->rdma.legacy_iwarp) {
+ ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
+ ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
conn_param.private_data = ird_ord_hdr;
conn_param.private_data_len = sizeof(ird_ord_hdr);
} else {
@@ -1641,7 +1815,17 @@ static int smb_direct_accept_client(struct smb_direct_transport *t)
conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY;
conn_param.flow_control = 0;
- ret = rdma_accept(t->cm_id, &conn_param);
+ /*
+ * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
+ * so that the timer will cause a disconnect.
+ */
+ sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
+ mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
+
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
+ ret = rdma_accept(sc->rdma.cm_id, &conn_param);
if (ret) {
pr_err("error at rdma_accept: %d\n", ret);
return ret;
@@ -1649,57 +1833,60 @@ static int smb_direct_accept_client(struct smb_direct_transport *t)
return 0;
}
-static int smb_direct_prepare_negotiation(struct smb_direct_transport *t)
+static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
{
+ struct smbdirect_recv_io *recvmsg;
int ret;
- struct smb_direct_recvmsg *recvmsg;
- recvmsg = get_free_recvmsg(t);
+ WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
+ sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
+
+ sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ;
+
+ recvmsg = get_free_recvmsg(sc);
if (!recvmsg)
return -ENOMEM;
- recvmsg->type = SMB_DIRECT_MSG_NEGOTIATE_REQ;
- ret = smb_direct_post_recv(t, recvmsg);
+ ret = smb_direct_post_recv(sc, recvmsg);
if (ret) {
pr_err("Can't post recv: %d\n", ret);
goto out_err;
}
- t->negotiation_requested = false;
- ret = smb_direct_accept_client(t);
+ ret = smb_direct_accept_client(sc);
if (ret) {
pr_err("Can't accept client\n");
goto out_err;
}
- smb_direct_post_recv_credits(&t->post_recv_credits_work.work);
+ smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work);
return 0;
out_err:
- put_recvmsg(t, recvmsg);
+ put_recvmsg(sc, recvmsg);
return ret;
}
-static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t)
+static unsigned int smb_direct_get_max_fr_pages(struct smbdirect_socket *sc)
{
return min_t(unsigned int,
- t->cm_id->device->attrs.max_fast_reg_page_list_len,
+ sc->ib.dev->attrs.max_fast_reg_page_list_len,
256);
}
-static int smb_direct_init_params(struct smb_direct_transport *t,
+static int smb_direct_init_params(struct smbdirect_socket *sc,
struct ib_qp_cap *cap)
{
- struct ib_device *device = t->cm_id->device;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct ib_device *device = sc->ib.dev;
int max_send_sges, max_rw_wrs, max_send_wrs;
unsigned int max_sge_per_wr, wrs_per_credit;
/* need 3 more sge. because a SMB_DIRECT header, SMB2 header,
* SMB2 response could be mapped.
*/
- t->max_send_size = smb_direct_max_send_size;
- max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3;
- if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) {
- pr_err("max_send_size %d is too large\n", t->max_send_size);
+ max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3;
+ if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) {
+ pr_err("max_send_size %d is too large\n", sp->max_send_size);
return -EINVAL;
}
@@ -1710,10 +1897,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
* are needed for MR registration, RDMA R/W, local & remote
* MR invalidation.
*/
- t->max_rdma_rw_size = smb_direct_max_read_write_size;
- t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t);
- t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size,
- (t->pages_per_rw_credit - 1) *
+ sc->rw_io.credits.num_pages = smb_direct_get_max_fr_pages(sc);
+ sc->rw_io.credits.max = DIV_ROUND_UP(sp->max_read_write_size,
+ (sc->rw_io.credits.num_pages - 1) *
PAGE_SIZE);
max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge,
@@ -1721,233 +1907,244 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
max_sge_per_wr = max_t(unsigned int, max_sge_per_wr,
max_send_sges);
wrs_per_credit = max_t(unsigned int, 4,
- DIV_ROUND_UP(t->pages_per_rw_credit,
+ DIV_ROUND_UP(sc->rw_io.credits.num_pages,
max_sge_per_wr) + 1);
- max_rw_wrs = t->max_rw_credits * wrs_per_credit;
+ max_rw_wrs = sc->rw_io.credits.max * wrs_per_credit;
- max_send_wrs = smb_direct_send_credit_target + max_rw_wrs;
+ max_send_wrs = sp->send_credit_target + max_rw_wrs;
if (max_send_wrs > device->attrs.max_cqe ||
max_send_wrs > device->attrs.max_qp_wr) {
pr_err("consider lowering send_credit_target = %d\n",
- smb_direct_send_credit_target);
+ sp->send_credit_target);
pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
device->attrs.max_cqe, device->attrs.max_qp_wr);
return -EINVAL;
}
- if (smb_direct_receive_credit_max > device->attrs.max_cqe ||
- smb_direct_receive_credit_max > device->attrs.max_qp_wr) {
+ if (sp->recv_credit_max > device->attrs.max_cqe ||
+ sp->recv_credit_max > device->attrs.max_qp_wr) {
pr_err("consider lowering receive_credit_max = %d\n",
- smb_direct_receive_credit_max);
+ sp->recv_credit_max);
pr_err("Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n",
device->attrs.max_cqe, device->attrs.max_qp_wr);
return -EINVAL;
}
- if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) {
+ if (device->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE) {
+ pr_err("warning: device max_send_sge = %d too small\n",
+ device->attrs.max_send_sge);
+ return -EINVAL;
+ }
+ if (device->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
pr_err("warning: device max_recv_sge = %d too small\n",
device->attrs.max_recv_sge);
return -EINVAL;
}
- t->recv_credits = 0;
- t->count_avail_recvmsg = 0;
-
- t->recv_credit_max = smb_direct_receive_credit_max;
- t->recv_credit_target = 10;
- t->new_recv_credits = 0;
-
- t->send_credit_target = smb_direct_send_credit_target;
- atomic_set(&t->send_credits, 0);
- atomic_set(&t->rw_credits, t->max_rw_credits);
+ sc->recv_io.credits.target = 1;
- t->max_send_size = smb_direct_max_send_size;
- t->max_recv_size = smb_direct_max_receive_size;
- t->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size;
+ atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max);
cap->max_send_wr = max_send_wrs;
- cap->max_recv_wr = t->recv_credit_max;
- cap->max_send_sge = max_sge_per_wr;
- cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
+ cap->max_recv_wr = sp->recv_credit_max;
+ cap->max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
+ cap->max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
cap->max_inline_data = 0;
- cap->max_rdma_ctxs = t->max_rw_credits;
+ cap->max_rdma_ctxs = sc->rw_io.credits.max;
return 0;
}
-static void smb_direct_destroy_pools(struct smb_direct_transport *t)
+static void smb_direct_destroy_pools(struct smbdirect_socket *sc)
{
- struct smb_direct_recvmsg *recvmsg;
+ struct smbdirect_recv_io *recvmsg;
- while ((recvmsg = get_free_recvmsg(t)))
- mempool_free(recvmsg, t->recvmsg_mempool);
+ while ((recvmsg = get_free_recvmsg(sc)))
+ mempool_free(recvmsg, sc->recv_io.mem.pool);
- mempool_destroy(t->recvmsg_mempool);
- t->recvmsg_mempool = NULL;
+ mempool_destroy(sc->recv_io.mem.pool);
+ sc->recv_io.mem.pool = NULL;
- kmem_cache_destroy(t->recvmsg_cache);
- t->recvmsg_cache = NULL;
+ kmem_cache_destroy(sc->recv_io.mem.cache);
+ sc->recv_io.mem.cache = NULL;
- mempool_destroy(t->sendmsg_mempool);
- t->sendmsg_mempool = NULL;
+ mempool_destroy(sc->send_io.mem.pool);
+ sc->send_io.mem.pool = NULL;
- kmem_cache_destroy(t->sendmsg_cache);
- t->sendmsg_cache = NULL;
+ kmem_cache_destroy(sc->send_io.mem.cache);
+ sc->send_io.mem.cache = NULL;
}
-static int smb_direct_create_pools(struct smb_direct_transport *t)
+static int smb_direct_create_pools(struct smbdirect_socket *sc)
{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
char name[80];
int i;
- struct smb_direct_recvmsg *recvmsg;
+ struct smbdirect_recv_io *recvmsg;
- snprintf(name, sizeof(name), "smb_direct_rqst_pool_%p", t);
- t->sendmsg_cache = kmem_cache_create(name,
- sizeof(struct smb_direct_sendmsg) +
- sizeof(struct smb_direct_negotiate_resp),
+ snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", sc);
+ sc->send_io.mem.cache = kmem_cache_create(name,
+ sizeof(struct smbdirect_send_io) +
+ sizeof(struct smbdirect_negotiate_resp),
0, SLAB_HWCACHE_ALIGN, NULL);
- if (!t->sendmsg_cache)
+ if (!sc->send_io.mem.cache)
return -ENOMEM;
- t->sendmsg_mempool = mempool_create(t->send_credit_target,
+ sc->send_io.mem.pool = mempool_create(sp->send_credit_target,
mempool_alloc_slab, mempool_free_slab,
- t->sendmsg_cache);
- if (!t->sendmsg_mempool)
+ sc->send_io.mem.cache);
+ if (!sc->send_io.mem.pool)
goto err;
- snprintf(name, sizeof(name), "smb_direct_resp_%p", t);
- t->recvmsg_cache = kmem_cache_create(name,
- sizeof(struct smb_direct_recvmsg) +
- t->max_recv_size,
+ snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc);
+ sc->recv_io.mem.cache = kmem_cache_create(name,
+ sizeof(struct smbdirect_recv_io) +
+ sp->max_recv_size,
0, SLAB_HWCACHE_ALIGN, NULL);
- if (!t->recvmsg_cache)
+ if (!sc->recv_io.mem.cache)
goto err;
- t->recvmsg_mempool =
- mempool_create(t->recv_credit_max, mempool_alloc_slab,
- mempool_free_slab, t->recvmsg_cache);
- if (!t->recvmsg_mempool)
+ sc->recv_io.mem.pool =
+ mempool_create(sp->recv_credit_max, mempool_alloc_slab,
+ mempool_free_slab, sc->recv_io.mem.cache);
+ if (!sc->recv_io.mem.pool)
goto err;
- INIT_LIST_HEAD(&t->recvmsg_queue);
-
- for (i = 0; i < t->recv_credit_max; i++) {
- recvmsg = mempool_alloc(t->recvmsg_mempool, KSMBD_DEFAULT_GFP);
+ for (i = 0; i < sp->recv_credit_max; i++) {
+ recvmsg = mempool_alloc(sc->recv_io.mem.pool, KSMBD_DEFAULT_GFP);
if (!recvmsg)
goto err;
- recvmsg->transport = t;
+ recvmsg->socket = sc;
recvmsg->sge.length = 0;
- list_add(&recvmsg->list, &t->recvmsg_queue);
+ list_add(&recvmsg->list, &sc->recv_io.free.list);
}
- t->count_avail_recvmsg = t->recv_credit_max;
return 0;
err:
- smb_direct_destroy_pools(t);
+ smb_direct_destroy_pools(sc);
return -ENOMEM;
}
-static int smb_direct_create_qpair(struct smb_direct_transport *t,
+static int smb_direct_create_qpair(struct smbdirect_socket *sc,
struct ib_qp_cap *cap)
{
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
int ret;
struct ib_qp_init_attr qp_attr;
int pages_per_rw;
- t->pd = ib_alloc_pd(t->cm_id->device, 0);
- if (IS_ERR(t->pd)) {
+ sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
+ if (IS_ERR(sc->ib.pd)) {
pr_err("Can't create RDMA PD\n");
- ret = PTR_ERR(t->pd);
- t->pd = NULL;
+ ret = PTR_ERR(sc->ib.pd);
+ sc->ib.pd = NULL;
return ret;
}
- t->send_cq = ib_alloc_cq(t->cm_id->device, t,
- smb_direct_send_credit_target + cap->max_rdma_ctxs,
- 0, IB_POLL_WORKQUEUE);
- if (IS_ERR(t->send_cq)) {
+ sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc,
+ sp->send_credit_target +
+ cap->max_rdma_ctxs,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(sc->ib.send_cq)) {
pr_err("Can't create RDMA send CQ\n");
- ret = PTR_ERR(t->send_cq);
- t->send_cq = NULL;
+ ret = PTR_ERR(sc->ib.send_cq);
+ sc->ib.send_cq = NULL;
goto err;
}
- t->recv_cq = ib_alloc_cq(t->cm_id->device, t,
- t->recv_credit_max, 0, IB_POLL_WORKQUEUE);
- if (IS_ERR(t->recv_cq)) {
+ sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc,
+ sp->recv_credit_max,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(sc->ib.recv_cq)) {
pr_err("Can't create RDMA recv CQ\n");
- ret = PTR_ERR(t->recv_cq);
- t->recv_cq = NULL;
+ ret = PTR_ERR(sc->ib.recv_cq);
+ sc->ib.recv_cq = NULL;
goto err;
}
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.event_handler = smb_direct_qpair_handler;
- qp_attr.qp_context = t;
+ qp_attr.qp_context = sc;
qp_attr.cap = *cap;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr.qp_type = IB_QPT_RC;
- qp_attr.send_cq = t->send_cq;
- qp_attr.recv_cq = t->recv_cq;
+ qp_attr.send_cq = sc->ib.send_cq;
+ qp_attr.recv_cq = sc->ib.recv_cq;
qp_attr.port_num = ~0;
- ret = rdma_create_qp(t->cm_id, t->pd, &qp_attr);
+ ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
if (ret) {
pr_err("Can't create RDMA QP: %d\n", ret);
goto err;
}
- t->qp = t->cm_id->qp;
- t->cm_id->event_handler = smb_direct_cm_handler;
+ sc->ib.qp = sc->rdma.cm_id->qp;
+ sc->rdma.cm_id->event_handler = smb_direct_cm_handler;
- pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
- if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) {
- ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs,
- t->max_rw_credits, IB_MR_TYPE_MEM_REG,
- t->pages_per_rw_credit, 0);
+ pages_per_rw = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE) + 1;
+ if (pages_per_rw > sc->ib.dev->attrs.max_sgl_rd) {
+ ret = ib_mr_pool_init(sc->ib.qp, &sc->ib.qp->rdma_mrs,
+ sc->rw_io.credits.max, IB_MR_TYPE_MEM_REG,
+ sc->rw_io.credits.num_pages, 0);
if (ret) {
- pr_err("failed to init mr pool count %d pages %d\n",
- t->max_rw_credits, t->pages_per_rw_credit);
+ pr_err("failed to init mr pool count %zu pages %zu\n",
+ sc->rw_io.credits.max, sc->rw_io.credits.num_pages);
goto err;
}
}
return 0;
err:
- if (t->qp) {
- t->qp = NULL;
- rdma_destroy_qp(t->cm_id);
+ if (sc->ib.qp) {
+ sc->ib.qp = NULL;
+ rdma_destroy_qp(sc->rdma.cm_id);
}
- if (t->recv_cq) {
- ib_destroy_cq(t->recv_cq);
- t->recv_cq = NULL;
+ if (sc->ib.recv_cq) {
+ ib_destroy_cq(sc->ib.recv_cq);
+ sc->ib.recv_cq = NULL;
}
- if (t->send_cq) {
- ib_destroy_cq(t->send_cq);
- t->send_cq = NULL;
+ if (sc->ib.send_cq) {
+ ib_destroy_cq(sc->ib.send_cq);
+ sc->ib.send_cq = NULL;
}
- if (t->pd) {
- ib_dealloc_pd(t->pd);
- t->pd = NULL;
+ if (sc->ib.pd) {
+ ib_dealloc_pd(sc->ib.pd);
+ sc->ib.pd = NULL;
}
return ret;
}
static int smb_direct_prepare(struct ksmbd_transport *t)
{
- struct smb_direct_transport *st = smb_trans_direct_transfort(t);
- struct smb_direct_recvmsg *recvmsg;
- struct smb_direct_negotiate_req *req;
+ struct smb_direct_transport *st = SMBD_TRANS(t);
+ struct smbdirect_socket *sc = &st->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_recv_io *recvmsg;
+ struct smbdirect_negotiate_req *req;
+ unsigned long flags;
int ret;
+ /*
+ * We are waiting to pass the following states:
+ *
+ * SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED
+ * SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING
+ * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED
+ *
+ * To finally get to SMBDIRECT_SOCKET_NEGOTIATE_RUNNING
+ * in order to continue below.
+ *
+ * Everything else is unexpected and an error.
+ */
ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n");
- ret = wait_event_interruptible_timeout(st->wait_status,
- st->negotiation_requested ||
- st->status == SMB_DIRECT_CS_DISCONNECTED,
- SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ);
- if (ret <= 0 || st->status == SMB_DIRECT_CS_DISCONNECTED)
+ ret = wait_event_interruptible_timeout(sc->status_wait,
+ sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED &&
+ sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING &&
+ sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED,
+ msecs_to_jiffies(sp->negotiate_timeout_msec));
+ if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING)
return ret < 0 ? ret : -ETIMEDOUT;
- recvmsg = get_first_reassembly(st);
+ recvmsg = get_first_reassembly(sc);
if (!recvmsg)
return -ECONNABORTED;
@@ -1955,51 +2152,54 @@ static int smb_direct_prepare(struct ksmbd_transport *t)
if (ret == -ECONNABORTED)
goto out;
- req = (struct smb_direct_negotiate_req *)recvmsg->packet;
- st->max_recv_size = min_t(int, st->max_recv_size,
+ req = (struct smbdirect_negotiate_req *)recvmsg->packet;
+ sp->max_recv_size = min_t(int, sp->max_recv_size,
le32_to_cpu(req->preferred_send_size));
- st->max_send_size = min_t(int, st->max_send_size,
+ sp->max_send_size = min_t(int, sp->max_send_size,
le32_to_cpu(req->max_receive_size));
- st->max_fragmented_send_size =
+ sp->max_fragmented_send_size =
le32_to_cpu(req->max_fragmented_size);
- st->max_fragmented_recv_size =
- (st->recv_credit_max * st->max_recv_size) / 2;
+ sp->max_fragmented_recv_size =
+ (sp->recv_credit_max * sp->max_recv_size) / 2;
+ sc->recv_io.credits.target = le16_to_cpu(req->credits_requested);
+ sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
+ sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1);
- ret = smb_direct_send_negotiate_response(st, ret);
+ ret = smb_direct_send_negotiate_response(sc, ret);
out:
- spin_lock_irq(&st->reassembly_queue_lock);
- st->reassembly_queue_length--;
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+ sc->recv_io.reassembly.queue_length--;
list_del(&recvmsg->list);
- spin_unlock_irq(&st->reassembly_queue_lock);
- put_recvmsg(st, recvmsg);
+ spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+ put_recvmsg(sc, recvmsg);
return ret;
}
-static int smb_direct_connect(struct smb_direct_transport *st)
+static int smb_direct_connect(struct smbdirect_socket *sc)
{
- int ret;
struct ib_qp_cap qp_cap;
+ int ret;
- ret = smb_direct_init_params(st, &qp_cap);
+ ret = smb_direct_init_params(sc, &qp_cap);
if (ret) {
pr_err("Can't configure RDMA parameters\n");
return ret;
}
- ret = smb_direct_create_pools(st);
+ ret = smb_direct_create_pools(sc);
if (ret) {
pr_err("Can't init RDMA pool: %d\n", ret);
return ret;
}
- ret = smb_direct_create_qpair(st, &qp_cap);
+ ret = smb_direct_create_qpair(sc, &qp_cap);
if (ret) {
pr_err("Can't accept RDMA client: %d\n", ret);
return ret;
}
- ret = smb_direct_prepare_negotiation(st);
+ ret = smb_direct_prepare_negotiation(sc);
if (ret) {
pr_err("Can't negotiate: %d\n", ret);
return ret;
@@ -2016,10 +2216,15 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
return true;
}
-static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
+static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id,
+ struct rdma_cm_event *event)
{
struct smb_direct_transport *t;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
struct task_struct *handler;
+ u8 peer_initiator_depth;
+ u8 peer_responder_resources;
int ret;
if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
@@ -2032,8 +2237,71 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
t = alloc_transport(new_cm_id);
if (!t)
return -ENOMEM;
+ sc = &t->socket;
+ sp = &sc->parameters;
+
+ peer_initiator_depth = event->param.conn.initiator_depth;
+ peer_responder_resources = event->param.conn.responder_resources;
+ if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) &&
+ event->param.conn.private_data_len == 8) {
+ /*
+ * Legacy clients with only iWarp MPA v1 support
+ * need a private blob in order to negotiate
+ * the IRD/ORD values.
+ */
+ const __be32 *ird_ord_hdr = event->param.conn.private_data;
+ u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
+ u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
+
+ /*
+ * cifs.ko sends the legacy IRD/ORD negotiation
+		 * even if iWarp MPA v2 was used.
+ *
+ * Here we check that the values match and only
+ * mark the client as legacy if they don't match.
+ */
+ if ((u32)event->param.conn.initiator_depth != ird32 ||
+ (u32)event->param.conn.responder_resources != ord32) {
+ /*
+			 * There are broken clients (old cifs.ko)
+			 * that use little endian, and
+			 * struct rdma_conn_param only uses u8
+			 * for initiator_depth and responder_resources,
+			 * so we truncate the values to U8_MAX.
+ *
+ * smb_direct_accept_client() will then
+ * do the real negotiation in order to
+ * select the minimum between client and
+ * server.
+ */
+ ird32 = min_t(u32, ird32, U8_MAX);
+ ord32 = min_t(u32, ord32, U8_MAX);
+
+ sc->rdma.legacy_iwarp = true;
+ peer_initiator_depth = (u8)ird32;
+ peer_responder_resources = (u8)ord32;
+ }
+ }
+
+ /*
+	 * First set what we as the server are able to support
+ */
+ sp->initiator_depth = min_t(u8, sp->initiator_depth,
+ new_cm_id->device->attrs.max_qp_rd_atom);
- ret = smb_direct_connect(t);
+ /*
+ * negotiate the value by using the minimum
+ * between client and server if the client provided
+	 * non-zero values.
+ */
+ if (peer_initiator_depth != 0)
+ sp->initiator_depth = min_t(u8, sp->initiator_depth,
+ peer_initiator_depth);
+ if (peer_responder_resources != 0)
+ sp->responder_resources = min_t(u8, sp->responder_resources,
+ peer_responder_resources);
+
+ ret = smb_direct_connect(sc);
if (ret)
goto out_err;
@@ -2057,7 +2325,7 @@ static int smb_direct_listen_handler(struct rdma_cm_id *cm_id,
{
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST: {
- int ret = smb_direct_handle_connect_request(cm_id);
+ int ret = smb_direct_handle_connect_request(cm_id, event);
if (ret) {
pr_err("Can't create transport: %d\n", ret);
@@ -2177,7 +2445,8 @@ int ksmbd_rdma_init(void)
* for lack of credits
*/
smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq",
- WQ_HIGHPRI | WQ_MEM_RECLAIM, 0);
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!smb_direct_wq)
return -ENOMEM;
diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h
index a2291b77488a..3f93c6a9f7e4 100644
--- a/fs/smb/server/transport_rdma.h
+++ b/fs/smb/server/transport_rdma.h
@@ -11,61 +11,20 @@
#define SMBD_MIN_IOSIZE (512 * 1024)
#define SMBD_MAX_IOSIZE (16 * 1024 * 1024)
-/* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */
-struct smb_direct_negotiate_req {
- __le16 min_version;
- __le16 max_version;
- __le16 reserved;
- __le16 credits_requested;
- __le32 preferred_send_size;
- __le32 max_receive_size;
- __le32 max_fragmented_size;
-} __packed;
-
-/* SMB DIRECT negotiation response packet [MS-SMBD] 2.2.2 */
-struct smb_direct_negotiate_resp {
- __le16 min_version;
- __le16 max_version;
- __le16 negotiated_version;
- __le16 reserved;
- __le16 credits_requested;
- __le16 credits_granted;
- __le32 status;
- __le32 max_readwrite_size;
- __le32 preferred_send_size;
- __le32 max_receive_size;
- __le32 max_fragmented_size;
-} __packed;
-
-#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001
-
-/* SMB DIRECT data transfer packet with payload [MS-SMBD] 2.2.3 */
-struct smb_direct_data_transfer {
- __le16 credits_requested;
- __le16 credits_granted;
- __le16 flags;
- __le16 reserved;
- __le32 remaining_data_length;
- __le32 data_offset;
- __le32 data_length;
- __le32 padding;
- __u8 buffer[];
-} __packed;
-
#ifdef CONFIG_SMB_SERVER_SMBDIRECT
int ksmbd_rdma_init(void);
void ksmbd_rdma_stop_listening(void);
void ksmbd_rdma_destroy(void);
bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
void init_smbd_max_io_size(unsigned int sz);
-unsigned int get_smbd_max_read_write_size(void);
+unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt);
#else
static inline int ksmbd_rdma_init(void) { return 0; }
static inline void ksmbd_rdma_stop_listening(void) { }
static inline void ksmbd_rdma_destroy(void) { }
static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
static inline void init_smbd_max_io_size(unsigned int sz) { }
-static inline unsigned int get_smbd_max_read_write_size(void) { return 0; }
+static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; }
#endif
#endif /* __KSMBD_TRANSPORT_RDMA_H__ */
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 04539037108c..1cfa688904b2 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -196,7 +196,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
pr_err("File(%s): creation failed (err:%d)\n", name, err);
}
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return err;
}
@@ -237,7 +237,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
if (!err && dentry != d)
ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry));
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
if (err)
pr_err("mkdir(%s): creation failed (err:%d)\n", name, err);
return err;
@@ -669,7 +669,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname,
ksmbd_debug(VFS, "vfs_link failed err %d\n", err);
out3:
- done_path_create(&newpath, dentry);
+ end_creating_path(&newpath, dentry);
out2:
path_put(&oldpath);
out1:
@@ -770,10 +770,9 @@ retry:
goto out4;
}
- rd.old_mnt_idmap = mnt_idmap(old_path->mnt),
+ rd.mnt_idmap = mnt_idmap(old_path->mnt),
rd.old_parent = old_parent,
rd.old_dentry = old_child,
- rd.new_mnt_idmap = mnt_idmap(new_path.mnt),
rd.new_parent = new_path.dentry,
rd.new_dentry = new_dentry,
rd.flags = flags,
@@ -1326,7 +1325,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
if (!abs_name)
return ERR_PTR(-ENOMEM);
- dent = kern_path_create(AT_FDCWD, abs_name, path, flags);
+ dent = start_creating_path(AT_FDCWD, abs_name, path, flags);
kfree(abs_name);
return dent;
}
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index 0708155b5caf..78b506c5ef03 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -112,6 +112,8 @@ struct ksmbd_file {
bool is_durable;
bool is_persistent;
bool is_resilient;
+
+ bool is_posix_ctxt;
};
static inline void set_ctx_actor(struct dir_context *ctx,
diff --git a/fs/super.c b/fs/super.c
index 7f876f32343a..f4fa0e93c463 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1716,49 +1716,6 @@ int get_tree_bdev(struct fs_context *fc,
}
EXPORT_SYMBOL(get_tree_bdev);
-static int test_bdev_super(struct super_block *s, void *data)
-{
- return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
-}
-
-struct dentry *mount_bdev(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data,
- int (*fill_super)(struct super_block *, void *, int))
-{
- struct super_block *s;
- int error;
- dev_t dev;
-
- error = lookup_bdev(dev_name, &dev);
- if (error)
- return ERR_PTR(error);
-
- flags |= SB_NOSEC;
- s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev);
- if (IS_ERR(s))
- return ERR_CAST(s);
-
- if (s->s_root) {
- if ((flags ^ s->s_flags) & SB_RDONLY) {
- deactivate_locked_super(s);
- return ERR_PTR(-EBUSY);
- }
- } else {
- error = setup_bdev_super(s, flags, NULL);
- if (!error)
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- s->s_flags |= SB_ACTIVE;
- }
-
- return dget(s->s_root);
-}
-EXPORT_SYMBOL(mount_bdev);
-
void kill_block_super(struct super_block *sb)
{
struct block_device *bdev = sb->s_bdev;
@@ -1773,26 +1730,6 @@ void kill_block_super(struct super_block *sb)
EXPORT_SYMBOL(kill_block_super);
#endif
-struct dentry *mount_nodev(struct file_system_type *fs_type,
- int flags, void *data,
- int (*fill_super)(struct super_block *, void *, int))
-{
- int error;
- struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);
-
- if (IS_ERR(s))
- return ERR_CAST(s);
-
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
- s->s_flags |= SB_ACTIVE;
- return dget(s->s_root);
-}
-EXPORT_SYMBOL(mount_nodev);
-
/**
* vfs_get_tree - Get the mountable root
* @fc: The superblock configuration context.
@@ -2314,17 +2251,20 @@ int sb_init_dio_done_wq(struct super_block *sb)
{
struct workqueue_struct *old;
struct workqueue_struct *wq = alloc_workqueue("dio/%s",
- WQ_MEM_RECLAIM, 0,
+ WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0,
sb->s_id);
if (!wq)
return -ENOMEM;
+
+ old = NULL;
/*
* This has to be atomic as more DIOs can race to create the workqueue
*/
- old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
- /* Someone created workqueue before us? Free ours... */
- if (old)
+ if (!try_cmpxchg(&sb->s_dio_done_wq, &old, wq)) {
+ /* Someone created workqueue before us? Free ours... */
destroy_workqueue(wq);
+ }
return 0;
}
EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
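
The try_cmpxchg() conversion above keeps the publish-once semantics of the old cmpxchg() call: only the first caller's workqueue gets installed in sb->s_dio_done_wq, and a loser frees its own allocation. Below is a minimal user-space sketch of the same idiom using C11 atomics; fake_wq_alloc()/fake_wq_destroy() are hypothetical stand-ins for alloc_workqueue()/destroy_workqueue(), not kernel APIs.

#include <stdatomic.h>
#include <stdlib.h>

struct fake_wq { int unused; };

static struct fake_wq *fake_wq_alloc(void) { return calloc(1, sizeof(struct fake_wq)); }
static void fake_wq_destroy(struct fake_wq *wq) { free(wq); }

/* Shared slot, analogous to sb->s_dio_done_wq. */
static _Atomic(struct fake_wq *) dio_done_wq;

static int init_dio_done_wq(void)
{
	struct fake_wq *wq = fake_wq_alloc();
	struct fake_wq *old = NULL;

	if (!wq)
		return -1;
	/*
	 * Like try_cmpxchg(): install wq only if the slot is still NULL.
	 * If another thread won the race, free the one we allocated;
	 * either way a workqueue ends up published.
	 */
	if (!atomic_compare_exchange_strong(&dio_done_wq, &old, wq))
		fake_wq_destroy(wq);
	return 0;
}
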
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index fb5ac358077b..0b14d004a095 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -88,6 +88,8 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
}
const struct fscrypt_operations ubifs_crypt_operations = {
+ .inode_info_offs = (int)offsetof(struct ubifs_inode, i_crypt_info) -
+ (int)offsetof(struct ubifs_inode, vfs_inode),
.legacy_key_prefix = "ubifs:",
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f3e3b2068608..46952a33c4e6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
static int ubifs_drop_inode(struct inode *inode)
{
- int drop = generic_drop_inode(inode);
+ int drop = inode_generic_drop(inode);
if (!drop)
drop = fscrypt_drop_inode(inode);
@@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode)
goto out;
dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
- ubifs_assert(c, !atomic_read(&inode->i_count));
+ ubifs_assert(c, !icount_read(inode));
truncate_inode_pages_final(&inode->i_data);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 5db45c9e26ee..49e50431741c 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -365,6 +365,7 @@ struct ubifs_gced_idx_leb {
* @read_in_a_row: number of consecutive pages read in a row (for bulk read)
* @data_len: length of the data attached to the inode
* @data: inode's data
+ * @i_crypt_info: inode's fscrypt information
*
* @ui_mutex exists for two main reasons. At first it prevents inodes from
* being written back while UBIFS changing them, being in the middle of an VFS
@@ -416,6 +417,9 @@ struct ubifs_inode {
pgoff_t read_in_a_row;
int data_len;
void *data;
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
+#endif
};
/**
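
The new .inode_info_offs initializer above records the signed distance between the fs-private i_crypt_info pointer and the embedded VFS inode, so generic fscrypt code can locate the pointer given only the struct inode. A small stand-alone sketch of that arithmetic follows; the struct names are hypothetical and this is not the fscrypt implementation.

#include <stddef.h>
#include <stdio.h>

struct vfs_inode { int dummy; };

struct my_fs_inode {
	void *i_crypt_info;		/* fs-private fscrypt pointer */
	struct vfs_inode vfs_inode;	/* embedded VFS inode */
};

/* Same expression shape as the ubifs_crypt_operations initializer above. */
static const int inode_info_offs =
	(int)offsetof(struct my_fs_inode, i_crypt_info) -
	(int)offsetof(struct my_fs_inode, vfs_inode);

/* Given only the embedded VFS inode, reach the private pointer. */
static void **crypt_info_addr(struct vfs_inode *inode)
{
	return (void **)((char *)inode + inode_info_offs);
}

int main(void)
{
	struct my_fs_inode fsi = { .i_crypt_info = (void *)0x1 };

	printf("%p\n", *crypt_info_addr(&fsi.vfs_inode));
	return 0;
}
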
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 503268cf4296..95ec42b84797 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -19,8 +19,7 @@ struct block_buffer {
};
/* Hash a block, writing the result to the next level's pending block buffer. */
-static int hash_one_block(struct inode *inode,
- const struct merkle_tree_params *params,
+static int hash_one_block(const struct merkle_tree_params *params,
struct block_buffer *cur)
{
struct block_buffer *next = cur + 1;
@@ -36,8 +35,7 @@ static int hash_one_block(struct inode *inode,
/* Zero-pad the block if it's shorter than the block size. */
memset(&cur->data[cur->filled], 0, params->block_size - cur->filled);
- fsverity_hash_block(params, inode, cur->data,
- &next->data[next->filled]);
+ fsverity_hash_block(params, cur->data, &next->data[next->filled]);
next->filled += params->digest_size;
cur->filled = 0;
return 0;
@@ -123,7 +121,7 @@ static int build_merkle_tree(struct file *filp,
fsverity_err(inode, "Short read of file data");
goto out;
}
- err = hash_one_block(inode, params, &buffers[-1]);
+ err = hash_one_block(params, &buffers[-1]);
if (err)
goto out;
for (level = 0; level < num_levels; level++) {
@@ -134,7 +132,7 @@ static int build_merkle_tree(struct file *filp,
}
/* Next block at @level is full */
- err = hash_one_block(inode, params, &buffers[level]);
+ err = hash_one_block(params, &buffers[level]);
if (err)
goto out;
err = write_merkle_tree_block(inode,
@@ -154,7 +152,7 @@ static int build_merkle_tree(struct file *filp,
/* Finish all nonempty pending tree blocks. */
for (level = 0; level < num_levels; level++) {
if (buffers[level].filled != 0) {
- err = hash_one_block(inode, params, &buffers[level]);
+ err = hash_one_block(params, &buffers[level]);
if (err)
goto out;
err = write_merkle_tree_block(inode,
@@ -284,9 +282,9 @@ static int enable_verity(struct file *filp,
/* Successfully enabled verity */
/*
- * Readers can start using ->i_verity_info immediately, so it
- * can't be rolled back once set. So don't set it until just
- * after the filesystem has successfully enabled verity.
+ * Readers can start using the inode's verity info immediately,
+ * so it can't be rolled back once set. So don't set it until
+ * just after the filesystem has successfully enabled verity.
*/
fsverity_set_info(inode, vi);
}
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 5fe854a5b9ad..dd20b138d452 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -63,10 +63,11 @@ struct merkle_tree_params {
* fsverity_info - cached verity metadata for an inode
*
* When a verity file is first opened, an instance of this struct is allocated
- * and stored in ->i_verity_info; it remains until the inode is evicted. It
- * caches information about the Merkle tree that's needed to efficiently verify
- * data read from the file. It also caches the file digest. The Merkle tree
- * pages themselves are not cached here, but the filesystem may cache them.
+ * and a pointer to it is stored in the file's in-memory inode. It remains
+ * until the inode is evicted. It caches information about the Merkle tree
+ * that's needed to efficiently verify data read from the file. It also caches
+ * the file digest. The Merkle tree pages themselves are not cached here, but
+ * the filesystem may cache them.
*/
struct fsverity_info {
struct merkle_tree_params tree_params;
@@ -89,7 +90,7 @@ union fsverity_hash_ctx *
fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
const u8 *salt, size_t salt_size);
void fsverity_hash_block(const struct merkle_tree_params *params,
- const struct inode *inode, const void *data, u8 *out);
+ const void *data, u8 *out);
void fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
const void *data, size_t size, u8 *out);
void __init fsverity_check_hash_algs(void);
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 9bb3c6344907..de53e14c8aa7 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -94,7 +94,6 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
/**
* fsverity_hash_block() - hash a single data or hash block
* @params: the Merkle tree's parameters
- * @inode: inode for which the hashing is being done
* @data: virtual address of a buffer containing the block to hash
* @out: output digest, size 'params->digest_size' bytes
*
@@ -102,7 +101,7 @@ fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
* in the Merkle tree parameters.
*/
void fsverity_hash_block(const struct merkle_tree_params *params,
- const struct inode *inode, const void *data, u8 *out)
+ const void *data, u8 *out)
{
union fsverity_hash_ctx ctx;
diff --git a/fs/verity/open.c b/fs/verity/open.c
index c561e130cd0c..77b1c977af02 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -244,17 +244,17 @@ fail:
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
{
/*
- * Multiple tasks may race to set ->i_verity_info, so use
- * cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fsverity_get_info(). I.e., here we publish ->i_verity_info with a
- * RELEASE barrier so that other tasks can ACQUIRE it.
+ * Multiple tasks may race to set the inode's verity info pointer, so
+ * use cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * fsverity_get_info(). I.e., publish the pointer with a RELEASE
+ * barrier so that other tasks can ACQUIRE it.
*/
- if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) {
- /* Lost the race, so free the fsverity_info we allocated. */
+ if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) {
+ /* Lost the race, so free the verity info we allocated. */
fsverity_free_info(vi);
/*
- * Afterwards, the caller may access ->i_verity_info directly,
- * so make sure to ACQUIRE the winning fsverity_info.
+ * Afterwards, the caller may access the inode's verity info
+ * directly, so make sure to ACQUIRE the winning verity info.
*/
(void)fsverity_get_info(inode);
}
@@ -350,7 +350,6 @@ int fsverity_get_descriptor(struct inode *inode,
return 0;
}
-/* Ensure the inode has an ->i_verity_info */
static int ensure_verity_info(struct inode *inode)
{
struct fsverity_info *vi = fsverity_get_info(inode);
@@ -395,8 +394,10 @@ EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr);
void __fsverity_cleanup_inode(struct inode *inode)
{
- fsverity_free_info(inode->i_verity_info);
- inode->i_verity_info = NULL;
+ struct fsverity_info **vi_addr = fsverity_info_addr(inode);
+
+ fsverity_free_info(*vi_addr);
+ *vi_addr = NULL;
}
EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode);
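
The comment in fsverity_set_info() above describes a release/acquire publication pattern. The sketch below shows the same pairing with C11 atomics in user space; the struct and function names are hypothetical, and cmpxchg_release()/smp_load_acquire() are only approximated, not reproduced.

#include <stdatomic.h>
#include <stdlib.h>

struct info { int ready_data; };

static _Atomic(struct info *) published;

static void publish_info(struct info *vi)
{
	struct info *expected = NULL;

	vi->ready_data = 42;	/* all initialisation happens before publishing */
	/* cmpxchg_release() analogue: only the first publisher wins. */
	if (!atomic_compare_exchange_strong_explicit(&published, &expected, vi,
						     memory_order_release,
						     memory_order_relaxed))
		free(vi);	/* lost the race; drop our copy */
}

static struct info *get_info(void)
{
	/* smp_load_acquire() analogue: safe to dereference the result. */
	return atomic_load_explicit(&published, memory_order_acquire);
}
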
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index a1f00c3fd3b2..86067c8b40cf 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -10,6 +10,31 @@
#include <linux/bio.h>
#include <linux/export.h>
+#define FS_VERITY_MAX_PENDING_BLOCKS 2
+
+struct fsverity_pending_block {
+ const void *data;
+ u64 pos;
+ u8 real_hash[FS_VERITY_MAX_DIGEST_SIZE];
+};
+
+struct fsverity_verification_context {
+ struct inode *inode;
+ struct fsverity_info *vi;
+ unsigned long max_ra_pages;
+
+ /*
+ * This is the queue of data blocks that are pending verification. When
+ * the crypto layer supports interleaved hashing, we allow multiple
+ * blocks to be queued up in order to utilize it. This can improve
+ * performance significantly vs. sequential hashing of each block.
+ */
+ int num_pending;
+ int max_pending;
+ struct fsverity_pending_block
+ pending_blocks[FS_VERITY_MAX_PENDING_BLOCKS];
+};
+
static struct workqueue_struct *fsverity_read_workqueue;
/*
@@ -79,7 +104,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
}
/*
- * Verify a single data block against the file's Merkle tree.
+ * Verify the hash of a single data block against the file's Merkle tree.
*
* In principle, we need to verify the entire path to the root node. However,
* for efficiency the filesystem may cache the hash blocks. Therefore we need
@@ -88,10 +113,11 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
*
* Return: %true if the data block is valid, else %false.
*/
-static bool
-verify_data_block(struct inode *inode, struct fsverity_info *vi,
- const void *data, u64 data_pos, unsigned long max_ra_pages)
+static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
+ const struct fsverity_pending_block *dblock,
+ unsigned long max_ra_pages)
{
+ const u64 data_pos = dblock->pos;
const struct merkle_tree_params *params = &vi->tree_params;
const unsigned int hsize = params->digest_size;
int level;
@@ -115,8 +141,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
*/
u64 hidx = data_pos >> params->log_blocksize;
- /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */
- BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX);
+ /*
+ * Up to FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS pages may
+ * be mapped at once.
+ */
+ static_assert(FS_VERITY_MAX_PENDING_BLOCKS + FS_VERITY_MAX_LEVELS <=
+ KM_MAX_IDX);
if (unlikely(data_pos >= inode->i_size)) {
/*
@@ -127,7 +157,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi,
* any part past EOF should be all zeroes. Therefore, we need
* to verify that any data blocks fully past EOF are all zeroes.
*/
- if (memchr_inv(data, 0, params->block_size)) {
+ if (memchr_inv(dblock->data, 0, params->block_size)) {
fsverity_err(inode,
"FILE CORRUPTED! Data past EOF is not zeroed");
return false;
@@ -202,7 +232,7 @@ descend:
unsigned long hblock_idx = hblocks[level - 1].index;
unsigned int hoffset = hblocks[level - 1].hoffset;
- fsverity_hash_block(params, inode, haddr, real_hash);
+ fsverity_hash_block(params, haddr, real_hash);
if (memcmp(want_hash, real_hash, hsize) != 0)
goto corrupted;
/*
@@ -220,18 +250,18 @@ descend:
put_page(hpage);
}
- /* Finally, verify the data block. */
- fsverity_hash_block(params, inode, data, real_hash);
- if (memcmp(want_hash, real_hash, hsize) != 0)
+ /* Finally, verify the hash of the data block. */
+ if (memcmp(want_hash, dblock->real_hash, hsize) != 0)
goto corrupted;
return true;
corrupted:
- fsverity_err(inode,
- "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
- data_pos, level - 1,
- params->hash_alg->name, hsize, want_hash,
- params->hash_alg->name, hsize, real_hash);
+ fsverity_err(
+ inode,
+ "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN",
+ data_pos, level - 1, params->hash_alg->name, hsize, want_hash,
+ params->hash_alg->name, hsize,
+ level == 0 ? dblock->real_hash : real_hash);
error:
for (; level > 0; level--) {
kunmap_local(hblocks[level - 1].addr);
@@ -240,13 +270,73 @@ error:
return false;
}
+static void
+fsverity_init_verification_context(struct fsverity_verification_context *ctx,
+ struct inode *inode,
+ unsigned long max_ra_pages)
+{
+ struct fsverity_info *vi = *fsverity_info_addr(inode);
+
+ ctx->inode = inode;
+ ctx->vi = vi;
+ ctx->max_ra_pages = max_ra_pages;
+ ctx->num_pending = 0;
+ if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 &&
+ sha256_finup_2x_is_optimized())
+ ctx->max_pending = 2;
+ else
+ ctx->max_pending = 1;
+}
+
+static void
+fsverity_clear_pending_blocks(struct fsverity_verification_context *ctx)
+{
+ int i;
+
+ for (i = ctx->num_pending - 1; i >= 0; i--) {
+ kunmap_local(ctx->pending_blocks[i].data);
+ ctx->pending_blocks[i].data = NULL;
+ }
+ ctx->num_pending = 0;
+}
+
static bool
-verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
- unsigned long max_ra_pages)
+fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx)
{
- struct inode *inode = data_folio->mapping->host;
- struct fsverity_info *vi = inode->i_verity_info;
- const unsigned int block_size = vi->tree_params.block_size;
+ struct fsverity_info *vi = ctx->vi;
+ const struct merkle_tree_params *params = &vi->tree_params;
+ int i;
+
+ if (ctx->num_pending == 2) {
+ /* num_pending == 2 implies that the algorithm is SHA-256 */
+ sha256_finup_2x(params->hashstate ? &params->hashstate->sha256 :
+ NULL,
+ ctx->pending_blocks[0].data,
+ ctx->pending_blocks[1].data, params->block_size,
+ ctx->pending_blocks[0].real_hash,
+ ctx->pending_blocks[1].real_hash);
+ } else {
+ for (i = 0; i < ctx->num_pending; i++)
+ fsverity_hash_block(params, ctx->pending_blocks[i].data,
+ ctx->pending_blocks[i].real_hash);
+ }
+
+ for (i = 0; i < ctx->num_pending; i++) {
+ if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i],
+ ctx->max_ra_pages))
+ return false;
+ }
+ fsverity_clear_pending_blocks(ctx);
+ return true;
+}
+
+static bool fsverity_add_data_blocks(struct fsverity_verification_context *ctx,
+ struct folio *data_folio, size_t len,
+ size_t offset)
+{
+ struct fsverity_info *vi = ctx->vi;
+ const struct merkle_tree_params *params = &vi->tree_params;
+ const unsigned int block_size = params->block_size;
u64 pos = (u64)data_folio->index << PAGE_SHIFT;
if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offset, block_size)))
@@ -255,14 +345,11 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
folio_test_uptodate(data_folio)))
return false;
do {
- void *data;
- bool valid;
-
- data = kmap_local_folio(data_folio, offset);
- valid = verify_data_block(inode, vi, data, pos + offset,
- max_ra_pages);
- kunmap_local(data);
- if (!valid)
+ ctx->pending_blocks[ctx->num_pending].data =
+ kmap_local_folio(data_folio, offset);
+ ctx->pending_blocks[ctx->num_pending].pos = pos + offset;
+ if (++ctx->num_pending == ctx->max_pending &&
+ !fsverity_verify_pending_blocks(ctx))
return false;
offset += block_size;
len -= block_size;
@@ -284,7 +371,15 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
*/
bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
{
- return verify_data_blocks(folio, len, offset, 0);
+ struct fsverity_verification_context ctx;
+
+ fsverity_init_verification_context(&ctx, folio->mapping->host, 0);
+
+ if (fsverity_add_data_blocks(&ctx, folio, len, offset) &&
+ fsverity_verify_pending_blocks(&ctx))
+ return true;
+ fsverity_clear_pending_blocks(&ctx);
+ return false;
}
EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
@@ -305,6 +400,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks);
*/
void fsverity_verify_bio(struct bio *bio)
{
+ struct inode *inode = bio_first_folio_all(bio)->mapping->host;
+ struct fsverity_verification_context ctx;
struct folio_iter fi;
unsigned long max_ra_pages = 0;
@@ -321,13 +418,21 @@ void fsverity_verify_bio(struct bio *bio)
max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
}
+ fsverity_init_verification_context(&ctx, inode, max_ra_pages);
+
bio_for_each_folio_all(fi, bio) {
- if (!verify_data_blocks(fi.folio, fi.length, fi.offset,
- max_ra_pages)) {
- bio->bi_status = BLK_STS_IOERR;
- break;
- }
+ if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
+ fi.offset))
+ goto ioerr;
}
+
+ if (!fsverity_verify_pending_blocks(&ctx))
+ goto ioerr;
+ return;
+
+ioerr:
+ fsverity_clear_pending_blocks(&ctx);
+ bio->bi_status = BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(fsverity_verify_bio);
#endif /* CONFIG_BLOCK */
@@ -355,7 +460,7 @@ void __init fsverity_init_workqueue(void)
* latency on ARM64.
*/
fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue",
- WQ_HIGHPRI,
+ WQ_HIGHPRI | WQ_PERCPU,
num_online_cpus());
if (!fsverity_read_workqueue)
panic("failed to allocate fsverity_read_queue");
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 065953475cf5..8930d5254e1d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -25,7 +25,7 @@ config XFS_FS
config XFS_SUPPORT_V4
bool "Support deprecated V4 (crc=0) format"
depends on XFS_FS
- default y
+ default n
help
The V4 filesystem format lacks certain features that are supported
by the V5 format, such as metadata checksumming, strengthened
@@ -40,7 +40,7 @@ config XFS_SUPPORT_V4
filesystem is a V4 filesystem. If no such string is found, please
upgrade xfsprogs to the latest version and try again.
- This option will become default N in September 2025. Support for the
+ This option became default N in September 2025. Support for the
V4 format will be removed entirely in September 2030. Distributors
can say N here to withdraw support earlier.
@@ -50,7 +50,7 @@ config XFS_SUPPORT_V4
config XFS_SUPPORT_ASCII_CI
bool "Support deprecated case-insensitive ascii (ascii-ci=1) format"
depends on XFS_FS
- default y
+ default n
help
The ASCII case insensitivity filesystem feature only works correctly
on systems that have been coerced into using ISO 8859-1, and it does
@@ -67,7 +67,7 @@ config XFS_SUPPORT_ASCII_CI
filesystem is a case-insensitive filesystem. If no such string is
found, please upgrade xfsprogs to the latest version and try again.
- This option will become default N in September 2025. Support for the
+ This option became default N in September 2025. Support for the
feature will be removed entirely in September 2030. Distributors
can say N here to withdraw support earlier.
@@ -137,7 +137,7 @@ config XFS_BTREE_IN_MEM
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
- default n
+ default y
depends on XFS_FS
depends on TMPFS && SHMEM
select XFS_LIVE_HOOKS
@@ -150,12 +150,8 @@ config XFS_ONLINE_SCRUB
advantage here is to look for problems proactively so that
they can be dealt with in a controlled manner.
- This feature is considered EXPERIMENTAL. Use with caution!
-
See the xfs_scrub man page in section 8 for additional information.
- If unsure, say N.
-
config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
@@ -171,11 +167,9 @@ config XFS_ONLINE_SCRUB_STATS
Usage data are collected in /sys/kernel/debug/xfs/scrub.
- If unsure, say N.
-
config XFS_ONLINE_REPAIR
bool "XFS online metadata repair support"
- default n
+ default y
depends on XFS_FS && XFS_ONLINE_SCRUB
select XFS_BTREE_IN_MEM
help
@@ -186,12 +180,8 @@ config XFS_ONLINE_REPAIR
formatted with secondary metadata, such as reverse mappings and inode
parent pointers.
- This feature is considered EXPERIMENTAL. Use with caution!
-
See the xfs_scrub man page in section 8 for additional information.
- If unsure, say N.
-
config XFS_WARN
bool "XFS Verbose Warnings"
depends on XFS_FS && !XFS_DEBUG
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index fb79215a509d..8ac8230c3d3c 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -92,9 +92,8 @@ xfs_ag_resv_critical(
trace_xfs_ag_resv_critical(pag, type, avail);
/* Critically low if less than 10% or max btree height remains. */
- return XFS_TEST_ERROR(avail < orig / 10 ||
- avail < mp->m_agbtree_maxlevels,
- mp, XFS_ERRTAG_AG_RESV_CRITICAL);
+ return avail < orig / 10 || avail < mp->m_agbtree_maxlevels ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_CRITICAL);
}
/*
@@ -203,7 +202,7 @@ __xfs_ag_resv_init(
return -EINVAL;
}
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_FAIL))
error = -ENOSPC;
else
error = xfs_dec_fdblocks(mp, hidden_space, true);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 000cc7f4a3ce..ad381c73abc4 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -3321,7 +3321,7 @@ xfs_agf_read_verify(
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_agf_verify(bp);
- if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
+ if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF))
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
@@ -4019,8 +4019,7 @@ __xfs_free_extent(
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
- if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_FREE_EXTENT))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
return -EIO;
error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index fddb55605e0c..91c1b30ebaab 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -667,12 +667,8 @@ xfs_attr_shortform_bytesfit(
/*
* For attr2 we can try to move the forkoff if there is space in the
- * literal area, but for the old format we are done if there is no
- * space in the fixed attribute fork.
+ * literal area.
*/
- if (!xfs_has_attr2(mp))
- return 0;
-
dsize = dp->i_df.if_bytes;
switch (dp->i_df.if_format) {
@@ -723,22 +719,16 @@ xfs_attr_shortform_bytesfit(
}
/*
- * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless:
- * - noattr2 mount option is set,
- * - on-disk version bit says it is already set, or
- * - the attr2 mount option is not set to enable automatic upgrade from attr1.
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless
+ * the on-disk version bit says it is already set.
*/
STATIC void
xfs_sbversion_add_attr2(
struct xfs_mount *mp,
struct xfs_trans *tp)
{
- if (xfs_has_noattr2(mp))
- return;
if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
return;
- if (!xfs_has_attr2(mp))
- return;
spin_lock(&mp->m_sb_lock);
xfs_add_attr2(mp);
@@ -889,7 +879,7 @@ xfs_attr_sf_removename(
/*
* Fix up the start offset of the attribute fork
*/
- if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) &&
+ if (totsize == sizeof(struct xfs_attr_sf_hdr) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
!(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) &&
!xfs_has_parent(mp)) {
@@ -900,7 +890,6 @@ xfs_attr_sf_removename(
ASSERT(dp->i_forkoff);
ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
- !xfs_has_attr2(mp) ||
dp->i_df.if_format == XFS_DINODE_FMT_BTREE ||
xfs_has_parent(mp));
xfs_trans_log_inode(args->trans, dp,
@@ -1040,8 +1029,7 @@ xfs_attr_shortform_allfit(
bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
be16_to_cpu(name_loc->valuelen));
}
- if (xfs_has_attr2(dp->i_mount) &&
- (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
+ if ((dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
return -1;
return xfs_attr_shortform_bytesfit(dp, bytes);
@@ -1161,7 +1149,6 @@ xfs_attr3_leaf_to_shortform(
* this case.
*/
if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
- ASSERT(xfs_has_attr2(dp->i_mount));
ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
xfs_attr_fork_remove(dp, args->trans);
}
@@ -1225,7 +1212,7 @@ xfs_attr3_leaf_to_node(
trace_xfs_attr_leaf_to_node(args);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
error = -EIO;
goto out;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index d954f9b8071f..53ef4b7e504d 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -997,8 +997,7 @@ xfs_bmap_add_attrfork_local(
static int
xfs_bmap_set_attrforkoff(
struct xfs_inode *ip,
- int size,
- int *version)
+ int size)
{
int default_size = xfs_default_attroffset(ip) >> 3;
@@ -1012,8 +1011,6 @@ xfs_bmap_set_attrforkoff(
ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size);
if (!ip->i_forkoff)
ip->i_forkoff = default_size;
- else if (xfs_has_attr2(ip->i_mount) && version)
- *version = 2;
break;
default:
ASSERT(0);
@@ -1035,7 +1032,6 @@ xfs_bmap_add_attrfork(
int rsvd) /* xact may use reserved blks */
{
struct xfs_mount *mp = tp->t_mountp;
- int version = 1; /* superblock attr version */
int logflags; /* logging flags */
int error; /* error return value */
@@ -1045,7 +1041,7 @@ xfs_bmap_add_attrfork(
ASSERT(!xfs_inode_has_attr_fork(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_bmap_set_attrforkoff(ip, size, &version);
+ error = xfs_bmap_set_attrforkoff(ip, size);
if (error)
return error;
@@ -1069,16 +1065,12 @@ xfs_bmap_add_attrfork(
xfs_trans_log_inode(tp, ip, logflags);
if (error)
return error;
- if (!xfs_has_attr(mp) ||
- (!xfs_has_attr2(mp) && version == 2)) {
+ if (!xfs_has_attr(mp)) {
bool log_sb = false;
spin_lock(&mp->m_sb_lock);
if (!xfs_has_attr(mp)) {
xfs_add_attr(mp);
- log_sb = true;
- }
- if (!xfs_has_attr2(mp) && version == 2) {
xfs_add_attr2(mp);
log_sb = true;
}
@@ -3662,8 +3654,7 @@ xfs_bmap_btalloc(
/* Trim the allocation back to the maximum an AG can fit. */
args.maxlen = min(ap->length, mp->m_ag_max_usable);
- if (unlikely(XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ if (unlikely(XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
error = xfs_bmap_exact_minlen_extent_alloc(ap, &args);
else if ((ap->datatype & XFS_ALLOC_USERDATA) &&
xfs_inode_is_filestream(ap->ip))
@@ -3849,7 +3840,7 @@ xfs_bmapi_read(
}
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4200,7 +4191,7 @@ xfs_bmapi_write(
(XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4545,7 +4536,7 @@ xfs_bmapi_remap(
(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5679,7 +5670,7 @@ xfs_bmap_collapse_extents(
int logflags = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5795,7 +5786,7 @@ xfs_bmap_insert_extents(
int logflags = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5900,7 +5891,7 @@ xfs_bmap_split_extent(
int i = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -6065,7 +6056,7 @@ xfs_bmap_finish_one(
trace_xfs_bmap_deferred(bi);
- if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
+ if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
return -EIO;
switch (bi->bi_type) {
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index a61211d253f1..dbe9df8c3300 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -306,7 +306,7 @@ xfs_btree_check_block(
fa = __xfs_btree_check_block(cur, block, level, bp);
if (XFS_IS_CORRUPT(mp, fa != NULL) ||
- XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
+ XFS_TEST_ERROR(mp, xfs_btree_block_errtag(cur))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
xfs_btree_mark_sick(cur);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 723a0643b838..90f7fc219fcc 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -565,7 +565,7 @@ xfs_da3_split(
trace_xfs_da_split(state->args);
- if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+ if (XFS_TEST_ERROR(state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
return -EIO;
/*
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 1775abcfa04d..82a338458a51 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -223,7 +223,7 @@ xfs_dir_ino_validate(
bool ino_ok = xfs_verify_dir_ino(mp, ino);
if (XFS_IS_CORRUPT(mp, !ino_ok) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
xfs_warn(mp, "Invalid inode number 0x%Lx",
(unsigned long long) ino);
return -EFSCORRUPTED;
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index a53c5d40e084..de840abc0bcd 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -4,14 +4,22 @@
* Copyright (C) 2017 Oracle.
* All Rights Reserved.
*/
-#ifndef __XFS_ERRORTAG_H_
+#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG)
#define __XFS_ERRORTAG_H_
/*
- * error injection tags - the labels can be anything you want
- * but each tag should have its own unique number
+ * There are two ways to use this header file. The first way is to #include it
+ * bare, which will define all the XFS_ERRTAG_* error injection knobs for use
+ * with the XFS_TEST_ERROR macro. The second way is to enclose the #include
+ * with a #define for an XFS_ERRTAG macro, in which case the header will define
+ " an XFS_ERRTAGS macro that expands to invoke that XFS_ERRTAG macro for each
+ * defined error injection knob.
*/
+/*
+ * These are the actual error injection tags. The numbers should be consecutive
+ * because arrays are sized based on the maximum.
+ */
#define XFS_ERRTAG_NOERROR 0
#define XFS_ERRTAG_IFLUSH_1 1
#define XFS_ERRTAG_IFLUSH_2 2
@@ -71,49 +79,61 @@
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
*/
#define XFS_RANDOM_DEFAULT 100
-#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
-#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_FREE_EXTENT 1
-#define XFS_RANDOM_RMAP_FINISH_ONE 1
-#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
-#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
-#define XFS_RANDOM_BMAP_FINISH_ONE 1
-#define XFS_RANDOM_AG_RESV_CRITICAL 4
-#define XFS_RANDOM_LOG_BAD_CRC 1
-#define XFS_RANDOM_LOG_ITEM_PIN 1
-#define XFS_RANDOM_BUF_LRU_REF 2
-#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
-#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
-#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
-#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
-#define XFS_RANDOM_AG_RESV_FAIL 1
-#define XFS_RANDOM_LARP 1
-#define XFS_RANDOM_DA_LEAF_SPLIT 1
-#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
-#define XFS_RANDOM_WB_DELAY_MS 3000
-#define XFS_RANDOM_WRITE_DELAY_MS 3000
-#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1
-#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4
+
+/*
+ * Table of error injection knobs. The parameters to the XFS_ERRTAG macro are:
+ * 1. The XFS_ERRTAG_ tag name, without the prefix;
+ * 2. The name of the sysfs knob; and
+ * 3. The default value for the knob.
+ */
+#ifdef XFS_ERRTAG
+# undef XFS_ERRTAGS
+# define XFS_ERRTAGS \
+XFS_ERRTAG(NOERROR, noerror, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_2, iflush2, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_3, iflush3, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_4, iflush4, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_5, iflush5, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_6, iflush6, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(DA_READ_BUF, dareadbuf, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(BTREE_CHECK_LBLOCK, btree_chk_lblk, XFS_RANDOM_DEFAULT/4) \
+XFS_ERRTAG(BTREE_CHECK_SBLOCK, btree_chk_sblk, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(ALLOC_READ_AGF, readagf, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IALLOC_READ_AGI, readagi, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(ITOBP_INOTOBP, itobp, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IUNLINK, iunlink, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IUNLINK_REMOVE, iunlinkrm, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(DIR_INO_VALIDATE, dirinovalid, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(BULKSTAT_READ_CHUNK, bulkstat, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IODONE_IOERR, logiodone, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(STRATREAD_IOERR, stratread, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(STRATCMPL_IOERR, stratcmpl, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(DIOWRITE_IOERR, diowrite, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(BMAPIFORMAT, bmapifmt, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(FREE_EXTENT, free_extent, 1) \
+XFS_ERRTAG(RMAP_FINISH_ONE, rmap_finish_one, 1) \
+XFS_ERRTAG(REFCOUNT_CONTINUE_UPDATE, refcount_continue_update, 1) \
+XFS_ERRTAG(REFCOUNT_FINISH_ONE, refcount_finish_one, 1) \
+XFS_ERRTAG(BMAP_FINISH_ONE, bmap_finish_one, 1) \
+XFS_ERRTAG(AG_RESV_CRITICAL, ag_resv_critical, 4) \
+XFS_ERRTAG(LOG_BAD_CRC, log_bad_crc, 1) \
+XFS_ERRTAG(LOG_ITEM_PIN, log_item_pin, 1) \
+XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2) \
+XFS_ERRTAG(FORCE_SCRUB_REPAIR, force_repair, 1) \
+XFS_ERRTAG(FORCE_SUMMARY_RECALC, bad_summary, 1) \
+XFS_ERRTAG(IUNLINK_FALLBACK, iunlink_fallback, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(BUF_IOERROR, buf_ioerror, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(REDUCE_MAX_IEXTENTS, reduce_max_iextents, 1) \
+XFS_ERRTAG(BMAP_ALLOC_MINLEN_EXTENT, bmap_alloc_minlen_extent, 1) \
+XFS_ERRTAG(AG_RESV_FAIL, ag_resv_fail, 1) \
+XFS_ERRTAG(LARP, larp, 1) \
+XFS_ERRTAG(DA_LEAF_SPLIT, da_leaf_split, 1) \
+XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \
+XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \
+XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \
+XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \
+XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4)
+#endif /* XFS_ERRTAG */
#endif /* __XFS_ERRORTAG_H_ */
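
One hypothetical way a consumer might expand the new XFS_ERRTAGS table: define XFS_ERRTAG before including the header, and each entry expands into per-tag data such as a sysfs name and a default frequency. This illustrates the X-macro pattern the header comment describes; it is not a copy of how fs/xfs actually consumes the table.

struct errtag_info {
	const char	*name;		/* sysfs knob name */
	unsigned int	default_freq;	/* default error frequency */
};

/* Expand each table entry into a designated array initializer. */
#define XFS_ERRTAG(tag, sysfs_name, dflt) \
	[XFS_ERRTAG_##tag] = { .name = #sysfs_name, .default_freq = (dflt) },

#include "xfs_errortag.h"	/* with XFS_ERRTAG defined, this emits XFS_ERRTAGS */

static const struct errtag_info errtag_table[] = {
	XFS_ERRTAGS
};

#undef XFS_ERRTAG
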
diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c
index 3f1d6a98c118..932ee4619e9e 100644
--- a/fs/xfs/libxfs/xfs_exchmaps.c
+++ b/fs/xfs/libxfs/xfs_exchmaps.c
@@ -616,7 +616,7 @@ xfs_exchmaps_finish_one(
return error;
}
- if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
+ if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
return -EIO;
/* If we still have work to do, ask for a new transaction. */
@@ -882,7 +882,7 @@ xmi_ensure_delta_nextents(
&new_nextents))
return -EFBIG;
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
new_nextents > 10)
return -EFBIG;
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 750111634d9f..d97295eaebe6 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2140,7 +2140,7 @@ xfs_difree_inobt(
* remove the chunk if the block size is large enough for multiple inode
* chunks (that might not be free).
*/
- if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ if (rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
xic->deleted = true;
xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino);
@@ -2286,7 +2286,7 @@ xfs_difree_finobt(
* enough for multiple chunks. Leave the finobt record to remain in sync
* with the inobt.
*/
- if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ if (rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
error = xfs_btree_delete(cur, &i);
if (error)
@@ -2706,7 +2706,7 @@ xfs_agi_read_verify(
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_agi_verify(bp);
- if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
+ if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI))
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index aa13fc00afd7..b1812b2c3cce 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -61,8 +61,8 @@ xfs_inode_buf_verify(
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
xfs_verify_agino_or_null(bp->b_pag, unlinked_ino);
- if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
- XFS_ERRTAG_ITOBP_INOTOBP))) {
+ if (unlikely(!di_ok ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
bp->b_flags &= ~XBF_DONE;
xfs_buf_ioerror(bp, -EIO);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 4f99b90add55..1772d82f2d68 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -756,8 +756,7 @@ xfs_iext_count_extend(
if (nr_exts < ifp->if_nextents)
return -EFBIG;
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
- nr_exts > 10)
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && nr_exts > 10)
return -EFBIG;
if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) {
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 48fe49a5f050..309ce6dd5553 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -299,17 +299,6 @@ xfs_inode_init(
} else {
inode_init_owner(args->idmap, inode, dir, args->mode);
}
-
- /*
- * If the group ID of the new file does not match the effective
- * group ID or one of the supplementary group IDs, the S_ISGID
- * bit is cleared (and only if the irix_sgid_inherit
- * compatibility variable is set).
- */
- if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
- !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
- inode->i_mode &= ~S_ISGID;
-
ip->i_projid = xfs_get_initial_prid(pip);
}
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 0d637c276db0..6c50cb2ece19 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -86,43 +86,6 @@ struct xfs_unmount_log_format {
uint32_t pad2; /* may as well make it 64 bits */
};
-/* Region types for iovec's i_type */
-#define XLOG_REG_TYPE_BFORMAT 1
-#define XLOG_REG_TYPE_BCHUNK 2
-#define XLOG_REG_TYPE_EFI_FORMAT 3
-#define XLOG_REG_TYPE_EFD_FORMAT 4
-#define XLOG_REG_TYPE_IFORMAT 5
-#define XLOG_REG_TYPE_ICORE 6
-#define XLOG_REG_TYPE_IEXT 7
-#define XLOG_REG_TYPE_IBROOT 8
-#define XLOG_REG_TYPE_ILOCAL 9
-#define XLOG_REG_TYPE_IATTR_EXT 10
-#define XLOG_REG_TYPE_IATTR_BROOT 11
-#define XLOG_REG_TYPE_IATTR_LOCAL 12
-#define XLOG_REG_TYPE_QFORMAT 13
-#define XLOG_REG_TYPE_DQUOT 14
-#define XLOG_REG_TYPE_QUOTAOFF 15
-#define XLOG_REG_TYPE_LRHEADER 16
-#define XLOG_REG_TYPE_UNMOUNT 17
-#define XLOG_REG_TYPE_COMMIT 18
-#define XLOG_REG_TYPE_TRANSHDR 19
-#define XLOG_REG_TYPE_ICREATE 20
-#define XLOG_REG_TYPE_RUI_FORMAT 21
-#define XLOG_REG_TYPE_RUD_FORMAT 22
-#define XLOG_REG_TYPE_CUI_FORMAT 23
-#define XLOG_REG_TYPE_CUD_FORMAT 24
-#define XLOG_REG_TYPE_BUI_FORMAT 25
-#define XLOG_REG_TYPE_BUD_FORMAT 26
-#define XLOG_REG_TYPE_ATTRI_FORMAT 27
-#define XLOG_REG_TYPE_ATTRD_FORMAT 28
-#define XLOG_REG_TYPE_ATTR_NAME 29
-#define XLOG_REG_TYPE_ATTR_VALUE 30
-#define XLOG_REG_TYPE_XMI_FORMAT 31
-#define XLOG_REG_TYPE_XMD_FORMAT 32
-#define XLOG_REG_TYPE_ATTR_NEWNAME 33
-#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
-#define XLOG_REG_TYPE_MAX 34
-
/*
* Flags to log operation header
*
@@ -141,14 +104,13 @@ struct xfs_unmount_log_format {
#define XLOG_END_TRANS 0x10 /* End a continued transaction */
#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
-
-typedef struct xlog_op_header {
+struct xlog_op_header {
__be32 oh_tid; /* transaction id of operation : 4 b */
__be32 oh_len; /* bytes in data region : 4 b */
__u8 oh_clientid; /* who sent me this : 1 b */
__u8 oh_flags; /* : 1 b */
__u16 oh_res2; /* 32 bit align : 2 b */
-} xlog_op_header_t;
+};
/* valid values for h_fmt */
#define XLOG_FMT_UNKNOWN 0
@@ -174,12 +136,40 @@ typedef struct xlog_rec_header {
__be32 h_prev_block; /* block number to previous LR : 4 */
__be32 h_num_logops; /* number of log operations in this LR : 4 */
__be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
- /* new fields */
+
+ /* fields added by the Linux port: */
__be32 h_fmt; /* format of log record : 4 */
uuid_t h_fs_uuid; /* uuid of FS : 16 */
+
+ /* fields added for log v2: */
__be32 h_size; /* iclog size : 4 */
+
+ /*
+ * When h_size was added for log v2 support, it caused the structure to
+ * have a different size on i386 vs all other architectures, because the
+ * sum of the sizes of the members is not padded to the alignment of the
+ * largest __be64-sized member and i386 has really odd struct alignment
+ * rules.
+ *
+ * Due to the way the log headers are laid out on disk, that alone is not
+ * a problem because the xlog_rec_header always sits alone in a
+ * BBSIZE-sized area, and the rest of that area is padded with zeroes.
+ * But xlog_cksum() calculates the checksum based on the structure size,
+ * and thus gives different checksums for i386 vs the rest.
+ * We now do two checksum validation passes for both sizes to allow
+ * moving v5 file systems with unclean logs between i386 and other
+ * (little-endian) architectures.
+ */
+ __u32 h_pad0;
} xlog_rec_header_t;
+#ifdef __i386__
+#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size)
+#define XLOG_REC_SIZE_OTHER sizeof(struct xlog_rec_header)
+#else
+#define XLOG_REC_SIZE sizeof(struct xlog_rec_header)
+#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size)
+#endif /* __i386__ */
+
typedef struct xlog_rec_ext_header {
__be32 xh_cycle; /* write cycle of log : 4 */
__be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
@@ -195,12 +185,11 @@ typedef union xlog_in_core2 {
} xlog_in_core_2_t;
/* not an on-disk structure, but needed by log recovery in userspace */
-typedef struct xfs_log_iovec {
+struct xfs_log_iovec {
void *i_addr; /* beginning address of region */
int i_len; /* length in bytes of region */
uint i_type; /* type of region */
-} xfs_log_iovec_t;
-
+};
/*
* Transaction Header definitions.
@@ -213,12 +202,12 @@ typedef struct xfs_log_iovec {
* Do not change the below structure without redoing the code in
* xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
*/
-typedef struct xfs_trans_header {
+struct xfs_trans_header {
uint th_magic; /* magic number */
uint th_type; /* transaction type */
int32_t th_tid; /* transaction id (unused) */
uint th_num_items; /* num items logged by trans */
-} xfs_trans_header_t;
+};
#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
@@ -542,7 +531,7 @@ struct xfs_log_dinode {
#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1)
-typedef struct xfs_buf_log_format {
+struct xfs_buf_log_format {
unsigned short blf_type; /* buf log item type indicator */
unsigned short blf_size; /* size of this item */
unsigned short blf_flags; /* misc state */
@@ -550,7 +539,7 @@ typedef struct xfs_buf_log_format {
int64_t blf_blkno; /* starting blkno of this buf */
unsigned int blf_map_size; /* used size of data bitmap in words */
unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
-} xfs_buf_log_format_t;
+};
/*
* All buffers now need to tell recovery where the magic number
@@ -606,40 +595,41 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
/*
* EFI/EFD log format definitions
*/
-typedef struct xfs_extent {
+struct xfs_extent {
xfs_fsblock_t ext_start;
xfs_extlen_t ext_len;
-} xfs_extent_t;
+};
/*
- * Since an xfs_extent_t has types (start:64, len: 32)
- * there are different alignments on 32 bit and 64 bit kernels.
- * So we provide the different variants for use by a
- * conversion routine.
+ * Since the members of struct xfs_extent add up to 96 bits (start:64,
+ * len:32), the structure has a different size and alignment on i386 vs
+ * all other architectures, because i386 does not pad structures to their
+ * natural alignment.
+ *
+ * Provide the different variants for use by a conversion routine.
*/
-typedef struct xfs_extent_32 {
+struct xfs_extent_32 {
uint64_t ext_start;
uint32_t ext_len;
-} __attribute__((packed)) xfs_extent_32_t;
+} __attribute__((packed));
-typedef struct xfs_extent_64 {
+struct xfs_extent_64 {
uint64_t ext_start;
uint32_t ext_len;
uint32_t ext_pad;
-} xfs_extent_64_t;
+};
/*
* This is the structure used to lay out an efi log item in the
* log. The efi_extents field is a variable size array whose
* size is given by efi_nextents.
*/
-typedef struct xfs_efi_log_format {
+struct xfs_efi_log_format {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_t efi_extents[]; /* array of extents to free */
-} xfs_efi_log_format_t;
+ struct xfs_extent efi_extents[]; /* array of extents to free */
+};
static inline size_t
xfs_efi_log_format_sizeof(
@@ -649,13 +639,13 @@ xfs_efi_log_format_sizeof(
nr * sizeof(struct xfs_extent);
}
-typedef struct xfs_efi_log_format_32 {
+struct xfs_efi_log_format_32 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_32_t efi_extents[]; /* array of extents to free */
-} __attribute__((packed)) xfs_efi_log_format_32_t;
+ struct xfs_extent_32 efi_extents[]; /* array of extents to free */
+} __attribute__((packed));
static inline size_t
xfs_efi_log_format32_sizeof(
@@ -665,13 +655,13 @@ xfs_efi_log_format32_sizeof(
nr * sizeof(struct xfs_extent_32);
}
-typedef struct xfs_efi_log_format_64 {
+struct xfs_efi_log_format_64 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_64_t efi_extents[]; /* array of extents to free */
-} xfs_efi_log_format_64_t;
+ struct xfs_extent_64 efi_extents[]; /* array of extents to free */
+};
static inline size_t
xfs_efi_log_format64_sizeof(
@@ -686,13 +676,13 @@ xfs_efi_log_format64_sizeof(
* log. The efd_extents array is a variable size array whose
* size is given by efd_nextents;
*/
-typedef struct xfs_efd_log_format {
+struct xfs_efd_log_format {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_t efd_extents[]; /* array of extents freed */
-} xfs_efd_log_format_t;
+ struct xfs_extent efd_extents[]; /* array of extents freed */
+};
static inline size_t
xfs_efd_log_format_sizeof(
@@ -702,13 +692,13 @@ xfs_efd_log_format_sizeof(
nr * sizeof(struct xfs_extent);
}
-typedef struct xfs_efd_log_format_32 {
+struct xfs_efd_log_format_32 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_32_t efd_extents[]; /* array of extents freed */
-} __attribute__((packed)) xfs_efd_log_format_32_t;
+ struct xfs_extent_32 efd_extents[]; /* array of extents freed */
+} __attribute__((packed));
static inline size_t
xfs_efd_log_format32_sizeof(
@@ -718,13 +708,13 @@ xfs_efd_log_format32_sizeof(
nr * sizeof(struct xfs_extent_32);
}
-typedef struct xfs_efd_log_format_64 {
+struct xfs_efd_log_format_64 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_64_t efd_extents[]; /* array of extents freed */
-} xfs_efd_log_format_64_t;
+ struct xfs_extent_64 efd_extents[]; /* array of extents freed */
+};
static inline size_t
xfs_efd_log_format64_sizeof(
@@ -957,14 +947,14 @@ struct xfs_xmd_log_format {
* The first two fields must be the type and size fitting into
* 32 bits : log_recovery code assumes that.
*/
-typedef struct xfs_dq_logformat {
+struct xfs_dq_logformat {
uint16_t qlf_type; /* dquot log item type */
uint16_t qlf_size; /* size of this item */
xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
int64_t qlf_blkno; /* blkno of dquot buffer */
int32_t qlf_len; /* len of dquot buffer */
uint32_t qlf_boffset; /* off of dquot in buffer */
-} xfs_dq_logformat_t;
+};
/*
* log format struct for QUOTAOFF records.
@@ -974,12 +964,12 @@ typedef struct xfs_dq_logformat {
* to the first and ensures that the first logitem is taken out of the AIL
* only when the last one is securely committed.
*/
-typedef struct xfs_qoff_logformat {
+struct xfs_qoff_logformat {
unsigned short qf_type; /* quotaoff log item type */
unsigned short qf_size; /* size of this item */
unsigned int qf_flags; /* USR and/or GRP */
char qf_pad[12]; /* padding for future */
-} xfs_qoff_logformat_t;
+};
/*
* Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
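
A stand-alone sketch of the two-pass checksum validation described by the xlog_rec_header comment in this file: recovery can accept a CRC computed over either the native header size or the other architecture's size. The CRC-32C helper and function name below are illustrative only, not the kernel's xlog_cksum() implementation.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Simple bitwise CRC-32C (Castagnoli), enough for the illustration. */
static uint32_t crc32c(const void *buf, size_t len)
{
	const unsigned char *p = buf;
	uint32_t crc = ~0u;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
	}
	return ~crc;
}

/*
 * Accept a record header whose CRC matches either the native layout size
 * (rec_size) or the other architecture's layout size (rec_size_other),
 * mirroring the XLOG_REC_SIZE/XLOG_REC_SIZE_OTHER pair defined above.
 */
static bool log_rec_crc_ok(const void *hdr, uint32_t want,
			   size_t rec_size, size_t rec_size_other)
{
	if (crc32c(hdr, rec_size) == want)
		return true;
	return crc32c(hdr, rec_size_other) == want;
}
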
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 95de23095030..9e712e62369c 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -111,7 +111,7 @@ struct xlog_recover_item {
struct xlog_recover {
struct hlist_node r_list;
xlog_tid_t r_log_tid; /* log's transaction id */
- xfs_trans_header_t r_theader; /* trans header for partial */
+ struct xfs_trans_header r_theader; /* trans header for partial */
int r_state; /* not needed */
xfs_lsn_t r_lsn; /* xact lsn */
struct list_head r_itemq; /* q for items */
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
index 225923e463c4..b02e3d6c0868 100644
--- a/fs/xfs/libxfs/xfs_metafile.c
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -121,7 +121,7 @@ xfs_metafile_resv_critical(
div_u64(mp->m_metafile_resv_target, 10)))
return true;
- return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+ return XFS_TEST_ERROR(mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
}
/* Allocate a block from the metadata file's reservation. */
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 5ed44fdf7491..7bfa3242e2c5 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -174,6 +174,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32);
XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 328);
+ XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 260);
XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16);
XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 897784037483..2484dc9f6d7e 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1113,8 +1113,7 @@ xfs_refcount_still_have_space(
* refcount continue update "error" has been injected.
*/
if (cur->bc_refc.nr_ops > 2 &&
- XFS_TEST_ERROR(false, cur->bc_mp,
- XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
+ XFS_TEST_ERROR(cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
if (cur->bc_refc.nr_ops == 0)
@@ -1398,7 +1397,7 @@ xfs_refcount_finish_one(
trace_xfs_refcount_deferred(mp, ri);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
return -EIO;
/*
@@ -1511,7 +1510,7 @@ xfs_rtrefcount_finish_one(
trace_xfs_refcount_deferred(mp, ri);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
return -EIO;
/*
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 3cdf50563fec..83e0488ff773 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2690,7 +2690,7 @@ xfs_rmap_finish_one(
trace_xfs_rmap_deferred(mp, ri);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_RMAP_FINISH_ONE))
return -EIO;
/*
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 5057536e586c..618061d898d4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1067,7 +1067,7 @@ xfs_rtfree_extent(
ASSERT(rbmip->i_itemp != NULL);
xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
return -EIO;
error = xfs_rtcheck_alloc_range(&args, start, len);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 711e180f9ebb..cdd16dd805d7 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -142,8 +142,6 @@ xfs_sb_version_to_features(
if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) {
if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)
features |= XFS_FEAT_LAZYSBCOUNT;
- if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
- features |= XFS_FEAT_ATTR2;
if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)
features |= XFS_FEAT_PROJID32;
if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)
@@ -155,7 +153,7 @@ xfs_sb_version_to_features(
/* Always on V5 features */
features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG |
- XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 |
+ XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 |
XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO;
/* Optional V5 features */
@@ -1524,7 +1522,8 @@ xfs_fs_geometry(
geo->version = XFS_FSOP_GEOM_VERSION;
geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
XFS_FSOP_GEOM_FLAGS_DIRV2 |
- XFS_FSOP_GEOM_FLAGS_EXTFLG;
+ XFS_FSOP_GEOM_FLAGS_EXTFLG |
+ XFS_FSOP_GEOM_FLAGS_ATTR2;
if (xfs_has_attr(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
if (xfs_has_quota(mp))
@@ -1537,8 +1536,6 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI;
if (xfs_has_lazysbcount(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB;
- if (xfs_has_attr2(mp))
- geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2;
if (xfs_has_projid32(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32;
if (xfs_has_crc(mp))
diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h
index c4f1367b2cca..5fefd132e002 100644
--- a/fs/xfs/libxfs/xfs_zones.h
+++ b/fs/xfs/libxfs/xfs_zones.h
@@ -29,6 +29,13 @@ struct xfs_rtgroup;
#define XFS_OPEN_GC_ZONES 1U
#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
+/*
+ * For zoned devices that do not have a limit on the number of open zones, and
+ * for regular devices using the zoned allocator, use the most common SMR disk
+ * limit (128) as the default limit on the number of open zones.
+ */
+#define XFS_DEFAULT_MAX_OPEN_ZONES 128
+
bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer);
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 38a246b8bf11..b2a83801412e 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -300,7 +300,7 @@ xrep_cow_find_bad(
* on the debugging knob, replace everything in the CoW fork.
*/
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
- XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
xc->irec.br_blockcount);
if (error)
@@ -385,7 +385,7 @@ xrep_cow_find_bad_rt(
* CoW fork and then scan for staging extents in the refcountbt.
*/
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
- XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
xc->irec.br_blockcount);
if (error)
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index 14939d7de349..378ec7c8d38e 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -79,7 +79,7 @@ xchk_metapath_cleanup(
if (mpath->dp_ilock_flags)
xfs_iunlock(mpath->dp, mpath->dp_ilock_flags);
- kfree(mpath->path);
+ kfree_const(mpath->path);
}
/* Set up a metadir path scan. @path must be dynamically allocated. */
@@ -98,13 +98,13 @@ xchk_setup_metapath_scan(
error = xchk_install_live_inode(sc, ip);
if (error) {
- kfree(path);
+ kfree_const(path);
return error;
}
mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS);
if (!mpath) {
- kfree(path);
+ kfree_const(path);
return -ENOMEM;
}
@@ -132,7 +132,7 @@ xchk_setup_metapath_rtdir(
return -ENOENT;
return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
- kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip);
+ kstrdup_const("rtgroups", GFP_KERNEL), sc->mp->m_rtdirip);
}
/* Scan a rtgroup inode under the /rtgroups directory. */
@@ -179,7 +179,7 @@ xchk_setup_metapath_quotadir(
return -ENOENT;
return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
- kstrdup("quota", GFP_KERNEL), qi->qi_dirip);
+ kstrdup_const("quota", GFP_KERNEL), qi->qi_dirip);
}
/* Scan a quota inode under the /quota directory. */
@@ -212,7 +212,7 @@ xchk_setup_metapath_dqinode(
return -ENOENT;
return xchk_setup_metapath_scan(sc, qi->qi_dirip,
- kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip);
+ kstrdup_const(xfs_dqinode_path(type), GFP_KERNEL), ip);
}
#else
# define xchk_setup_metapath_quotadir(...) (-ENOENT)
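The metapath setup paths above switch from kasprintf()/kstrdup() plus kfree() to the const-aware string helpers. A minimal self-contained sketch of how that pairing behaves, assuming the standard kernel definitions of kstrdup_const() and kfree_const(); the example function below is hypothetical and only stands in for the path handling in xchk_setup_metapath_scan():

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>

/*
 * Sketch only: kstrdup_const() returns its argument unchanged when the source
 * string lives in .rodata (as the "quota" literal here does) and falls back to
 * a real kstrdup() otherwise; kfree_const() only frees memory that was really
 * allocated, so the pair is safe in both cases.
 */
static int example_metapath_name(void)
{
	const char *path = kstrdup_const("quota", GFP_KERNEL);

	if (!path)
		return -ENOMEM;

	/* ... hand @path to the scan, as xchk_setup_metapath_scan() does ... */

	kfree_const(path);
	return 0;
}
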
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 1588ce971cb8..951ae8b71566 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -28,6 +28,15 @@
#include "scrub/newbt.h"
/*
+ * This is the maximum number of deferred extent freeing item (EFI) extents
+ * that we'll attach to a transaction without rolling the transaction to avoid
+ * overrunning a tr_itruncate reservation. The newbt code should reserve
+ * exactly the correct number of blocks to rebuild the btree, so there should
+ * not be any excess blocks to free when committing a new btree.
+ */
+#define XREP_MAX_ITRUNCATE_EFIS (128)
+
+/*
* Estimate proper slack values for a btree that's being reloaded.
*
* Under most circumstances, we'll take whatever default loading value the
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 8703897c0a9c..07f5bb8a6421 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -36,6 +36,12 @@
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_extfree_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_bmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -91,21 +97,33 @@
struct xreap_state {
struct xfs_scrub *sc;
- /* Reverse mapping owner and metadata reservation type. */
- const struct xfs_owner_info *oinfo;
- enum xfs_ag_resv_type resv;
+ union {
+ struct {
+ /*
+ * For AG blocks, this is reverse mapping owner and
+ * metadata reservation type.
+ */
+ const struct xfs_owner_info *oinfo;
+ enum xfs_ag_resv_type resv;
+ };
+ struct {
+ /* For file blocks, this is the inode and fork. */
+ struct xfs_inode *ip;
+ int whichfork;
+ };
+ };
- /* If true, roll the transaction before reaping the next extent. */
- bool force_roll;
+ /* Number of invalidated buffers logged to the current transaction. */
+ unsigned int nr_binval;
- /* Number of deferred reaps attached to the current transaction. */
- unsigned int deferred;
+ /* Maximum number of buffers we can invalidate in a single tx. */
+ unsigned int max_binval;
- /* Number of invalidated buffers logged to the current transaction. */
- unsigned int invalidated;
+ /* Number of deferred reaps attached to the current transaction. */
+ unsigned int nr_deferred;
- /* Number of deferred reaps queued during the whole reap sequence. */
- unsigned long long total_deferred;
+ /* Maximum number of intents we can reap in a single transaction. */
+ unsigned int max_deferred;
};
/* Put a block back on the AGFL. */
@@ -148,71 +166,79 @@ xreap_put_freelist(
}
/* Are there any uncommitted reap operations? */
-static inline bool xreap_dirty(const struct xreap_state *rs)
+static inline bool xreap_is_dirty(const struct xreap_state *rs)
{
- if (rs->force_roll)
- return true;
- if (rs->deferred)
- return true;
- if (rs->invalidated)
- return true;
- if (rs->total_deferred)
- return true;
- return false;
+ return rs->nr_binval > 0 || rs->nr_deferred > 0;
}
-#define XREAP_MAX_BINVAL (2048)
-
/*
- * Decide if we want to roll the transaction after reaping an extent. We don't
- * want to overrun the transaction reservation, so we prohibit more than
- * 128 EFIs per transaction. For the same reason, we limit the number
- * of buffer invalidations to 2048.
+ * Decide if we need to roll the transaction to clear out the log
+ * reservation that we allocated to buffer invalidations.
*/
-static inline bool xreap_want_roll(const struct xreap_state *rs)
+static inline bool xreap_want_binval_roll(const struct xreap_state *rs)
{
- if (rs->force_roll)
- return true;
- if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
- return true;
- if (rs->invalidated > XREAP_MAX_BINVAL)
- return true;
- return false;
+ return rs->nr_binval >= rs->max_binval;
}
-static inline void xreap_reset(struct xreap_state *rs)
+/* Reset the buffer invalidation count after rolling. */
+static inline void xreap_binval_reset(struct xreap_state *rs)
{
- rs->total_deferred += rs->deferred;
- rs->deferred = 0;
- rs->invalidated = 0;
- rs->force_roll = false;
+ rs->nr_binval = 0;
}
-#define XREAP_MAX_DEFER_CHAIN (2048)
+/*
+ * Bump the number of invalidated buffers, and return true if we can continue,
+ * or false if we need to roll the transaction.
+ */
+static inline bool xreap_inc_binval(struct xreap_state *rs)
+{
+ rs->nr_binval++;
+ return rs->nr_binval < rs->max_binval;
+}
/*
* Decide if we want to finish the deferred ops that are attached to the scrub
* transaction. We don't want to queue huge chains of deferred ops because
* that can consume a lot of log space and kernel memory. Hence we trigger a
- * xfs_defer_finish if there are more than 2048 deferred reap operations or the
- * caller did some real work.
+ * xfs_defer_finish if there are too many deferred reap operations or we've run
+ * out of space for invalidations.
*/
-static inline bool
-xreap_want_defer_finish(const struct xreap_state *rs)
+static inline bool xreap_want_defer_finish(const struct xreap_state *rs)
{
- if (rs->force_roll)
- return true;
- if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
- return true;
- return false;
+ return rs->nr_deferred >= rs->max_deferred;
}
+/*
+ * Reset the defer chain length and buffer invalidation count after finishing
+ * items.
+ */
static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
- rs->total_deferred = 0;
- rs->deferred = 0;
- rs->invalidated = 0;
- rs->force_roll = false;
+ rs->nr_deferred = 0;
+ rs->nr_binval = 0;
+}
+
+/*
+ * Bump the number of deferred extent reaps.
+ */
+static inline void xreap_inc_defer(struct xreap_state *rs)
+{
+ rs->nr_deferred++;
+}
+
+/* Force the caller to finish a deferred item chain. */
+static inline void xreap_force_defer_finish(struct xreap_state *rs)
+{
+ rs->nr_deferred = rs->max_deferred;
+}
+
+/* Maximum number of fsblocks that we might find in a buffer to invalidate. */
+static inline unsigned int
+xrep_binval_max_fsblocks(
+ struct xfs_mount *mp)
+{
+ /* Remote xattr values are the largest buffers that we support. */
+ return xfs_attr3_max_rmt_blocks(mp);
}
/*
@@ -224,12 +250,8 @@ xrep_bufscan_max_sectors(
struct xfs_mount *mp,
xfs_extlen_t fsblocks)
{
- int max_fsbs;
-
- /* Remote xattr values are the largest buffers that we support. */
- max_fsbs = xfs_attr3_max_rmt_blocks(mp);
-
- return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
+ return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks,
+ xrep_binval_max_fsblocks(mp)));
}
/*
@@ -297,14 +319,13 @@ xreap_agextent_binval(
while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
xfs_trans_bjoin(sc->tp, bp);
xfs_trans_binval(sc->tp, bp);
- rs->invalidated++;
/*
* Stop invalidating if we've hit the limit; we should
* still have enough reservation left to free however
* far we've gotten.
*/
- if (rs->invalidated > XREAP_MAX_BINVAL) {
+ if (!xreap_inc_binval(rs)) {
*aglenp -= agbno_next - bno;
goto out;
}
@@ -416,21 +437,23 @@ xreap_agextent_iter(
trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno,
*aglenp);
- rs->force_roll = true;
-
if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
/*
- * If we're unmapping CoW staging extents, remove the
+ * t0: Unmapping CoW staging extents, remove the
* records from the refcountbt, which will remove the
* rmap record as well.
*/
xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
*aglenp);
+ xreap_inc_defer(rs);
return 0;
}
- return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
- *aglenp, rs->oinfo);
+ /* t1: unmap crosslinked metadata blocks */
+ xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp,
+ rs->oinfo->oi_owner);
+ xreap_inc_defer(rs);
+ return 0;
}
trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp);
@@ -443,12 +466,12 @@ xreap_agextent_iter(
*/
xreap_agextent_binval(rs, agbno, aglenp);
if (*aglenp == 0) {
- ASSERT(xreap_want_roll(rs));
+ ASSERT(xreap_want_binval_roll(rs));
return 0;
}
/*
- * If we're getting rid of CoW staging extents, use deferred work items
+ * t2: To get rid of CoW staging extents, use deferred work items
* to remove the refcountbt records (which removes the rmap records)
* and free the extent. We're not worried about the system going down
* here because log recovery walks the refcount btree to clean out the
@@ -463,23 +486,23 @@ xreap_agextent_iter(
if (error)
return error;
- rs->force_roll = true;
+ xreap_inc_defer(rs);
return 0;
}
- /* Put blocks back on the AGFL one at a time. */
+ /* t3: Put blocks back on the AGFL one at a time. */
if (rs->resv == XFS_AG_RESV_AGFL) {
ASSERT(*aglenp == 1);
error = xreap_put_freelist(sc, agbno);
if (error)
return error;
- rs->force_roll = true;
+ xreap_force_defer_finish(rs);
return 0;
}
/*
- * Use deferred frees to get rid of the old btree blocks to try to
+ * t4: Use deferred frees to get rid of the old btree blocks to try to
* minimize the window in which we could crash and lose the old blocks.
* Add a defer ops barrier every other extent to avoid stressing the
* system with large EFIs.
@@ -489,12 +512,194 @@ xreap_agextent_iter(
if (error)
return error;
- rs->deferred++;
- if (rs->deferred % 2 == 0)
+ xreap_inc_defer(rs);
+ if (rs->nr_deferred % 2 == 0)
xfs_defer_add_barrier(sc->tp);
return 0;
}
+/* Configure the deferral and invalidation limits */
+static inline void
+xreap_configure_limits(
+ struct xreap_state *rs,
+ unsigned int fixed_overhead,
+ unsigned int variable_overhead,
+ unsigned int per_intent,
+ unsigned int per_binval)
+{
+ struct xfs_scrub *sc = rs->sc;
+ unsigned int res = sc->tp->t_log_res - fixed_overhead;
+
+ /* Don't underflow the reservation */
+ if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) {
+ ASSERT(sc->tp->t_log_res >=
+ (fixed_overhead + variable_overhead));
+ xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE);
+ return;
+ }
+
+ rs->max_deferred = per_intent ? res / variable_overhead : 0;
+ res -= rs->max_deferred * per_intent;
+ rs->max_binval = per_binval ? res / per_binval : 0;
+}
+
+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single per-AG space extent. This is not for freeing CoW
+ * staging extents.
+ */
+STATIC void
+xreap_configure_agextent_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+
+ /*
+ * Various things can happen when reaping non-CoW metadata blocks:
+ *
+ * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap
+ * record.
+ *
+ * t3: Freeing to AGFL: roll and finish deferred items for every block.
+ * Limits here do not matter.
+ *
+ * t4: Freeing metadata blocks: deferred freeing of the space, which
+ * also removes the rmap record.
+ *
+ * For simplicity, we'll use the worst-case intents size to determine
+ * the maximum number of deferred extents before we have to finish the
+ * whole chain. If we're trying to reap a btree larger than this size,
+ * a crash midway through reaping can result in leaked blocks.
+ */
+ const unsigned int t1 = rui;
+ const unsigned int t4 = rui + efi;
+ const unsigned int per_intent = max(t1, t4);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of EFI or
+ * RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int step_size = max(f1, f2);
+
+ /* Largest buffer size (in fsblocks) that can be invalidated. */
+ const unsigned int max_binval = xrep_binval_max_fsblocks(mp);
+
+ /* Maximum overhead of invalidating one buffer. */
+ const unsigned int per_binval =
+ xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
+
+ /*
+ * For each transaction in a reap chain, we can delete some number of
+ * extents and invalidate some number of blocks. We assume that btree
+ * blocks aren't usually contiguous; and that scrub likely pulled all
+ * the buffers into memory. From these assumptions, set the maximum
+ * number of deferrals we can queue before flushing the defer chain,
+ * and the number of invalidations we can queue before rolling to a
+ * clean transaction (and possibly relogging some of the deferrals) to
+ * the same quantity.
+ */
+ const unsigned int variable_overhead = per_intent + per_binval;
+
+ xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
+ per_binval);
+
+ trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval,
+ step_size, per_intent, rs->max_deferred);
+}
+
+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single CoW staging extent. This is not for freeing
+ * metadata blocks.
+ */
+STATIC void
+xreap_configure_agcow_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+ const unsigned int cui = xfs_cui_log_space(1) +
+ xfs_cud_log_space();
+
+ /*
+ * Various things can happen when reaping CoW staging extents:
+ *
+ * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount
+ * record, which defers removal of rmap record
+ *
+ * t2: Freeing CoW blocks: deferred removal of refcount record, which
+ * defers removal of rmap record; and deferred removal of the space
+ *
+ * For simplicity, we'll use the worst-case intents size to determine
+ * the maximum number of deferred extents before we have to finish the
+ * whole chain. If we're trying to reap a btree larger than this size,
+ * a crash midway through reaping can result in leaked blocks.
+ */
+ const unsigned int t0 = cui + rui;
+ const unsigned int t2 = cui + rui + efi;
+ const unsigned int per_intent = max(t0, t2);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of CUI, EFI,
+ * or RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1);
+ const unsigned int step_size = max3(f1, f2, f3);
+
+ /* Largest buffer size (in fsblocks) that can be invalidated. */
+ const unsigned int max_binval = xrep_binval_max_fsblocks(mp);
+
+ /* Overhead of invalidating one buffer */
+ const unsigned int per_binval =
+ xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
+
+ /*
+ * For each transaction in a reap chain, we can delete some number of
+ * extents and invalidate some number of blocks. We assume that CoW
+ * staging extents are usually more than 1 fsblock, and that there
+ * shouldn't be any buffers for those blocks. From these assumptions,
+ * set the number of deferrals to use as much of the reservation as
+ * possible, but leave space to invalidate 1/8th that number of buffers.
+ */
+ const unsigned int variable_overhead = per_intent +
+ (per_binval / 8);
+
+ xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
+ per_binval);
+
+ trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size,
+ per_intent, rs->max_deferred);
+}
+
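
To make the split of t_log_res above concrete, here is a standalone sketch of the same arithmetic with made-up numbers; the real per_intent, per_binval and step_size values come from the xfs_*_log_space() and xfs_calc_finish_*_reservation() helpers used in the configure functions above, so every figure below is hypothetical.

#include <stdio.h>

int main(void)
{
	/* Hypothetical inputs; the kernel derives these from the transaction
	 * reservation and the log-space helpers. */
	unsigned int t_log_res  = 262144;	/* transaction log reservation */
	unsigned int step_size  = 16384;	/* fixed overhead: one defer step */
	unsigned int per_intent = 96;		/* worst-case intent + done item */
	unsigned int per_binval = 320;		/* worst-case buffer invalidation */

	/* Same flow as xreap_configure_limits(): carve out the fixed part,
	 * then split the rest between intents and buffer invalidations. */
	unsigned int res = t_log_res - step_size;
	unsigned int max_deferred = res / (per_intent + per_binval);

	res -= max_deferred * per_intent;
	printf("max_deferred=%u max_binval=%u\n",
	       max_deferred, res / per_binval);
	return 0;
}
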
/*
* Break an AG metadata extent into sub-extents by fate (crosslinked, not
* crosslinked), and dispose of each sub-extent separately.
@@ -531,11 +736,11 @@ xreap_agmeta_extent(
if (error)
return error;
xreap_defer_finish_reset(rs);
- } else if (xreap_want_roll(rs)) {
+ } else if (xreap_want_binval_roll(rs)) {
error = xrep_roll_ag_trans(sc);
if (error)
return error;
- xreap_reset(rs);
+ xreap_binval_reset(rs);
}
agbno += aglen;
@@ -562,11 +767,12 @@ xrep_reap_agblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip == NULL);
+ xreap_configure_agextent_limits(&rs);
error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs))
+ if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -628,7 +834,7 @@ xreap_fsmeta_extent(
if (error)
goto out_agf;
xreap_defer_finish_reset(rs);
- } else if (xreap_want_roll(rs)) {
+ } else if (xreap_want_binval_roll(rs)) {
/*
* Hold the AGF buffer across the transaction roll so
* that we don't have to reattach it to the scrub
@@ -639,7 +845,7 @@ xreap_fsmeta_extent(
xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
if (error)
goto out_agf;
- xreap_reset(rs);
+ xreap_binval_reset(rs);
}
agbno += aglen;
@@ -674,11 +880,15 @@ xrep_reap_fsblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip != NULL);
+ if (oinfo == &XFS_RMAP_OINFO_COW)
+ xreap_configure_agcow_limits(&rs);
+ else
+ xreap_configure_agextent_limits(&rs);
error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs))
+ if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -770,7 +980,7 @@ xreap_rgextent_iter(
rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno);
/*
- * If there are other rmappings, this block is cross linked and must
+ * t1: There are other rmappings; this block is cross linked and must
* not be freed. Remove the forward and reverse mapping and move on.
*/
if (crosslinked) {
@@ -778,14 +988,14 @@ xreap_rgextent_iter(
*rglenp);
xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
- rs->deferred++;
+ xreap_inc_defer(rs);
return 0;
}
trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);
/*
- * The CoW staging extent is not crosslinked. Use deferred work items
+ * t2: The CoW staging extent is not crosslinked. Use deferred work
* to remove the refcountbt records (which removes the rmap records)
* and free the extent. We're not worried about the system going down
* here because log recovery walks the refcount btree to clean out the
@@ -799,10 +1009,73 @@ xreap_rgextent_iter(
if (error)
return error;
- rs->deferred++;
+ xreap_inc_defer(rs);
return 0;
}
+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single CoW staging extent. This is not for freeing
+ * metadata blocks.
+ */
+STATIC void
+xreap_configure_rgcow_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+ const unsigned int cui = xfs_cui_log_space(1) +
+ xfs_cud_log_space();
+
+ /*
+ * Various things can happen when reaping CoW staging extents:
+ *
+ * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount
+ * record, which defers removal of rmap record
+ *
+ * t2: Freeing CoW blocks: deferred removal of refcount record, which
+ * defers removal of rmap record; and deferred removal of the space
+ *
+ * For simplicity, we'll use the worst-case intents size to determine
+ * the maximum number of deferred extents before we have to finish the
+ * whole chain. If we're trying to reap a btree larger than this size,
+ * a crash midway through reaping can result in leaked blocks.
+ */
+ const unsigned int t1 = cui + rui;
+ const unsigned int t2 = cui + rui + efi;
+ const unsigned int per_intent = max(t1, t2);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of CUI, EFI,
+ * or RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_rt_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rt_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_rt_cui_reservation(mp, 1);
+ const unsigned int step_size = max3(f1, f2, f3);
+
+ /*
+ * The only buffer for the rt device is the rtgroup super, so we don't
+ * need to save space for buffer invalidations.
+ */
+ xreap_configure_limits(rs, step_size, per_intent, per_intent, 0);
+
+ trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent,
+ rs->max_deferred);
+}
+
#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \
XFS_RTGLOCK_RMAP | \
XFS_RTGLOCK_REFCOUNT)
@@ -855,11 +1128,11 @@ xreap_rtmeta_extent(
if (error)
goto out_unlock;
xreap_defer_finish_reset(rs);
- } else if (xreap_want_roll(rs)) {
+ } else if (xreap_want_binval_roll(rs)) {
error = xfs_trans_roll_inode(&sc->tp, sc->ip);
if (error)
goto out_unlock;
- xreap_reset(rs);
+ xreap_binval_reset(rs);
}
rgbno += rglen;
@@ -891,12 +1164,14 @@ xrep_reap_rtblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip != NULL);
+ ASSERT(oinfo == &XFS_RMAP_OINFO_COW);
+ xreap_configure_rgcow_limits(&rs);
error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs))
+ if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -929,13 +1204,13 @@ xrep_reap_metadir_fsblocks(
ASSERT(sc->ip != NULL);
ASSERT(xfs_is_metadir_inode(sc->ip));
+ xreap_configure_agextent_limits(&rs);
xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
-
error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs)) {
+ if (xreap_is_dirty(&rs)) {
error = xrep_defer_finish(sc);
if (error)
return error;
@@ -955,13 +1230,12 @@ xrep_reap_metadir_fsblocks(
*/
STATIC int
xreap_bmapi_select(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap,
bool *crosslinked)
{
struct xfs_owner_info oinfo;
+ struct xfs_scrub *sc = rs->sc;
struct xfs_btree_cur *cur;
xfs_filblks_t len = 1;
xfs_agblock_t bno;
@@ -975,7 +1249,8 @@ xreap_bmapi_select(
cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
sc->sa.pag);
- xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
+ xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork,
+ imap->br_startoff);
error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
if (error)
goto out_cur;
@@ -1038,21 +1313,19 @@ xreap_buf_loggable(
*/
STATIC int
xreap_bmapi_binval(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap)
{
+ struct xfs_scrub *sc = rs->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_perag *pag = sc->sa.pag;
- int bmap_flags = xfs_bmapi_aflag(whichfork);
+ int bmap_flags = xfs_bmapi_aflag(rs->whichfork);
xfs_fileoff_t off;
xfs_fileoff_t max_off;
xfs_extlen_t scan_blocks;
xfs_agblock_t bno;
xfs_agblock_t agbno;
xfs_agblock_t agbno_next;
- unsigned int invalidated = 0;
int error;
/*
@@ -1079,7 +1352,7 @@ xreap_bmapi_binval(
struct xfs_bmbt_irec hmap;
int nhmaps = 1;
- error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
+ error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap,
&nhmaps, bmap_flags);
if (error)
return error;
@@ -1120,14 +1393,13 @@ xreap_bmapi_binval(
xfs_buf_stale(bp);
xfs_buf_relse(bp);
}
- invalidated++;
/*
* Stop invalidating if we've hit the limit; we should
* still have enough reservation left to free however
- * much of the mapping we've seen so far.
+ * far we've gotten.
*/
- if (invalidated > XREAP_MAX_BINVAL) {
+ if (!xreap_inc_binval(rs)) {
imap->br_blockcount = agbno_next - bno;
goto out;
}
@@ -1149,12 +1421,11 @@ out:
*/
STATIC int
xrep_reap_bmapi_iter(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap,
bool crosslinked)
{
+ struct xfs_scrub *sc = rs->sc;
int error;
if (crosslinked) {
@@ -1171,14 +1442,14 @@ xrep_reap_bmapi_iter(
imap->br_blockcount);
/*
- * Schedule removal of the mapping from the fork. We use
+ * t0: Schedule removal of the mapping from the fork. We use
* deferred log intents in this function to control the exact
* sequence of metadata updates.
*/
- xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
- xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
-(int64_t)imap->br_blockcount);
- xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
return 0;
}
@@ -1199,41 +1470,139 @@ xrep_reap_bmapi_iter(
* transaction is full of logged buffer invalidations, so we need to
* return early so that we can roll and retry.
*/
- error = xreap_bmapi_binval(sc, ip, whichfork, imap);
+ error = xreap_bmapi_binval(rs, imap);
if (error || imap->br_blockcount == 0)
return error;
/*
- * Schedule removal of the mapping from the fork. We use deferred log
- * intents in this function to control the exact sequence of metadata
+ * t1: Schedule removal of the mapping from the fork. We use deferred
+ * work in this function to control the exact sequence of metadata
* updates.
*/
- xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
- xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
-(int64_t)imap->br_blockcount);
return xfs_free_extent_later(sc->tp, imap->br_startblock,
imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
XFS_FREE_EXTENT_SKIP_DISCARD);
}
+/* Compute the maximum mapcount of a file buffer. */
+static unsigned int
+xreap_bmapi_binval_mapcount(
+ struct xfs_scrub *sc)
+{
+ /* directory blocks can span multiple fsblocks and be discontiguous */
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR)
+ return sc->mp->m_dir_geo->fsbcount;
+
+ /* all other file xattr/symlink blocks must be contiguous */
+ return 1;
+}
+
+/* Compute the maximum block size of a file buffer. */
+static unsigned int
+xreap_bmapi_binval_blocksize(
+ struct xfs_scrub *sc)
+{
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_DIR:
+ return sc->mp->m_dir_geo->blksize;
+ case XFS_SCRUB_TYPE_XATTR:
+ case XFS_SCRUB_TYPE_PARENT:
+ /*
+ * The xattr structure itself consists of single fsblocks, but
+ * there could be remote xattr blocks to invalidate.
+ */
+ return XFS_XATTR_SIZE_MAX;
+ }
+
+ /* everything else is a single block */
+ return sc->mp->m_sb.sb_blocksize;
+}
+
+/*
+ * Compute the maximum number of buffer invalidations that we can do while
+ * reaping a single extent from a file fork.
+ */
+STATIC void
+xreap_configure_bmapi_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /* overhead of invalidating a buffer */
+ const unsigned int per_binval =
+ xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc),
+ xreap_bmapi_binval_blocksize(sc));
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+ const unsigned int bui = xfs_bui_log_space(1) +
+ xfs_bud_log_space();
+
+ /*
+ * t1: Unmapping crosslinked file data blocks: one bmap deletion,
+ * possibly an EFI for underfilled bmbt blocks, and an rmap deletion.
+ *
+ * t2: Freeing file data blocks: one bmap deletion, possibly an
+ * EFI for underfilled bmbt blocks, and another EFI for the space
+ * itself.
+ */
+ const unsigned int t1 = (bui + efi) + rui;
+ const unsigned int t2 = (bui + efi) + efi;
+ const unsigned int per_intent = max(t1, t2);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of BUI, EFI,
+ * or RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_bui_reservation(mp, 1);
+ const unsigned int step_size = max3(f1, f2, f3);
+
+ /*
+ * Each call to xreap_ifork_extent starts with a clean transaction and
+ * operates on a single mapping by creating a chain of log intent items
+ * for that mapping. We need to leave enough reservation in the
+ * transaction to log btree buffer and inode updates for each step in
+ * the chain, and to relog the log intents.
+ */
+ const unsigned int per_extent_res = per_intent + step_size;
+
+ xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval);
+
+ trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval,
+ step_size, per_intent, 1);
+}
+
/*
* Dispose of as much of this file extent as we can. Upon successful return,
* the imap will reflect the mapping that was removed from the fork.
*/
STATIC int
xreap_ifork_extent(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap)
{
+ struct xfs_scrub *sc = rs->sc;
xfs_agnumber_t agno;
bool crosslinked;
int error;
ASSERT(sc->sa.pag == NULL);
- trace_xreap_ifork_extent(sc, ip, whichfork, imap);
+ trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap);
agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
sc->sa.pag = xfs_perag_get(sc->mp, agno);
@@ -1248,11 +1617,11 @@ xreap_ifork_extent(
* Decide the fate of the blocks at the beginning of the mapping, then
* update the mapping to use it with the unmap calls.
*/
- error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
+ error = xreap_bmapi_select(rs, imap, &crosslinked);
if (error)
goto out_agf;
- error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
+ error = xrep_reap_bmapi_iter(rs, imap, crosslinked);
if (error)
goto out_agf;
@@ -1276,6 +1645,11 @@ xrep_reap_ifork(
struct xfs_inode *ip,
int whichfork)
{
+ struct xreap_state rs = {
+ .sc = sc,
+ .ip = ip,
+ .whichfork = whichfork,
+ };
xfs_fileoff_t off = 0;
int bmap_flags = xfs_bmapi_aflag(whichfork);
int error;
@@ -1284,6 +1658,7 @@ xrep_reap_ifork(
ASSERT(ip == sc->ip || ip == sc->tempip);
ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));
+ xreap_configure_bmapi_limits(&rs);
while (off < XFS_MAX_FILEOFF) {
struct xfs_bmbt_irec imap;
int nimaps = 1;
@@ -1303,13 +1678,14 @@ xrep_reap_ifork(
* can in a single transaction.
*/
if (xfs_bmap_is_real_extent(&imap)) {
- error = xreap_ifork_extent(sc, ip, whichfork, &imap);
+ error = xreap_ifork_extent(&rs, &imap);
if (error)
return error;
error = xfs_defer_finish(&sc->tp);
if (error)
return error;
+ xreap_defer_finish_reset(&rs);
}
off = imap.br_startoff + imap.br_blockcount;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index d00c18954a26..efd5a7ccdf62 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1110,7 +1110,7 @@ xrep_will_attempt(
return true;
/* Let debug users force us into the repair routines. */
- if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ if (XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
return true;
/* Metadata is corrupt or failed cross-referencing. */
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 9c04295742c8..2bb125c4f9bf 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -18,14 +18,6 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
#ifdef CONFIG_XFS_ONLINE_REPAIR
-/*
- * This is the maximum number of deferred extent freeing item extents (EFIs)
- * that we'll attach to a transaction without rolling the transaction to avoid
- * overrunning a tr_itruncate reservation.
- */
-#define XREP_MAX_ITRUNCATE_EFIS (128)
-
-
/* Repair helpers */
int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run);
diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c
index 953ce7be78dc..5902398185a8 100644
--- a/fs/xfs/scrub/symlink_repair.c
+++ b/fs/xfs/scrub/symlink_repair.c
@@ -185,7 +185,7 @@ xrep_symlink_salvage_inline(
return 0;
nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip));
- strncpy(target_buf, ifp->if_data, nr);
+ memcpy(target_buf, ifp->if_data, nr);
return nr;
}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 2450e214103f..987313a52e64 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -22,6 +22,7 @@
#include "xfs_parent.h"
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
+#include "xfs_trans.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index a8187281eb96..39ea651cbb75 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -2000,6 +2000,51 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
+DECLARE_EVENT_CLASS(xrep_reap_limits_class,
+ TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval,
+ unsigned int max_binval, unsigned int step_size,
+ unsigned int per_intent,
+ unsigned int max_deferred),
+ TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, log_res)
+ __field(unsigned int, per_binval)
+ __field(unsigned int, max_binval)
+ __field(unsigned int, step_size)
+ __field(unsigned int, per_intent)
+ __field(unsigned int, max_deferred)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev;
+ __entry->log_res = tp->t_log_res;
+ __entry->per_binval = per_binval;
+ __entry->max_binval = max_binval;
+ __entry->step_size = step_size;
+ __entry->per_intent = per_intent;
+ __entry->max_deferred = max_deferred;
+ ),
+ TP_printk("dev %d:%d logres %u per_binval %u max_binval %u step_size %u per_intent %u max_deferred %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->log_res,
+ __entry->per_binval,
+ __entry->max_binval,
+ __entry->step_size,
+ __entry->per_intent,
+ __entry->max_deferred)
+);
+#define DEFINE_REPAIR_REAP_LIMITS_EVENT(name) \
+DEFINE_EVENT(xrep_reap_limits_class, name, \
+ TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, \
+ unsigned int max_binval, unsigned int step_size, \
+ unsigned int per_intent, \
+ unsigned int max_deferred), \
+ TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred))
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits);
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits);
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits);
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_bmapi_limits);
+
DECLARE_EVENT_CLASS(xrep_reap_find_class,
TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
xfs_extlen_t len, bool crosslinked),
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 5eef3bc30bda..c3a593319bee 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -491,7 +491,7 @@ xfs_attr_finish_item(
/* Reset trans after EAGAIN cycle since the transaction is new */
args->trans = tp;
- if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ if (XFS_TEST_ERROR(args->dp->i_mount, XFS_ERRTAG_LARP)) {
error = -EIO;
goto out;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f9ef3b2a332a..773d959965dc 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -387,8 +387,6 @@ xfs_buf_map_verify(
struct xfs_buftarg *btp,
struct xfs_buf_map *map)
{
- xfs_daddr_t eofs;
-
/* Check for IOs smaller than the sector size / not sector aligned */
ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
@@ -397,11 +395,10 @@ xfs_buf_map_verify(
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
- eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (map->bm_bn < 0 || map->bm_bn >= eofs) {
+ if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) {
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
- __func__, map->bm_bn, eofs);
+ __func__, map->bm_bn, btp->bt_nr_sectors);
WARN_ON(1);
return -EFSCORRUPTED;
}
@@ -1299,7 +1296,7 @@ xfs_buf_bio_end_io(
if (bio->bi_status)
xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
- XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
+ XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
xfs_buf_ioerror(bp, -EIO);
if (bp->b_flags & XBF_ASYNC) {
@@ -1720,26 +1717,30 @@ xfs_configure_buftarg_atomic_writes(
int
xfs_configure_buftarg(
struct xfs_buftarg *btp,
- unsigned int sectorsize)
+ unsigned int sectorsize,
+ xfs_rfsblock_t nr_blocks)
{
- int error;
+ struct xfs_mount *mp = btp->bt_mount;
- ASSERT(btp->bt_bdev != NULL);
+ if (btp->bt_bdev) {
+ int error;
- /* Set up metadata sector size info */
- btp->bt_meta_sectorsize = sectorsize;
- btp->bt_meta_sectormask = sectorsize - 1;
+ error = bdev_validate_blocksize(btp->bt_bdev, sectorsize);
+ if (error) {
+ xfs_warn(mp,
+ "Cannot use blocksize %u on device %pg, err %d",
+ sectorsize, btp->bt_bdev, error);
+ return -EINVAL;
+ }
- error = bdev_validate_blocksize(btp->bt_bdev, sectorsize);
- if (error) {
- xfs_warn(btp->bt_mount,
- "Cannot use blocksize %u on device %pg, err %d",
- sectorsize, btp->bt_bdev, error);
- return -EINVAL;
+ if (bdev_can_atomic_write(btp->bt_bdev))
+ xfs_configure_buftarg_atomic_writes(btp);
}
- if (bdev_can_atomic_write(btp->bt_bdev))
- xfs_configure_buftarg_atomic_writes(btp);
+ btp->bt_meta_sectorsize = sectorsize;
+ btp->bt_meta_sectormask = sectorsize - 1;
+ /* m_blkbb_log is not set up yet */
+ btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT);
return 0;
}
@@ -1749,6 +1750,9 @@ xfs_init_buftarg(
size_t logical_sectorsize,
const char *descr)
{
+ /* The maximum size of the buftarg is only known once the sb is read. */
+ btp->bt_nr_sectors = (xfs_daddr_t)-1;
+
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = logical_sectorsize;
btp->bt_logical_sectormask = logical_sectorsize - 1;
@@ -2084,7 +2088,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
* This allows userspace to disrupt buffer caching for debug/testing
* purposes.
*/
- if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
+ if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
lru_ref = 0;
atomic_set(&bp->b_lru_ref, lru_ref);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b269e115d9ac..8fa7bdf59c91 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -103,6 +103,7 @@ struct xfs_buftarg {
size_t bt_meta_sectormask;
size_t bt_logical_sectorsize;
size_t bt_logical_sectormask;
+ xfs_daddr_t bt_nr_sectors;
/* LRU control structures */
struct shrinker *bt_shrinker;
@@ -372,7 +373,8 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
-int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize);
+int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize,
+ xfs_rfsblock_t nr_blocks);
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
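
The new bt_nr_sectors field caches the device size in 512-byte basic blocks so xfs_buf_map_verify() can range-check a daddr without rederiving it from the superblock each time. A small self-contained sketch of the conversion and the check, using hypothetical geometry (4k filesystem blocks, so sb_blocklog = 12, with BBSHIFT = 9 as in XFS):

#include <stdbool.h>
#include <stdint.h>

#define BBSHIFT		9	/* 512-byte basic blocks, as in XFS */

/* Hypothetical geometry: 4k blocks, 1M filesystem blocks on the data device */
static const unsigned int sb_blocklog = 12;
static const uint64_t sb_dblocks = 1048576;

static bool daddr_in_range(int64_t daddr)
{
	/* nr_blocks << (sb_blocklog - BBSHIFT): each 4k block is 8 sectors */
	int64_t bt_nr_sectors = (int64_t)(sb_dblocks << (sb_blocklog - BBSHIFT));

	return daddr >= 0 && daddr < bt_nr_sectors;
}
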
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 5d58e2ae4972..e4c8af873632 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -736,6 +736,16 @@ xlog_recover_do_primary_sb_buffer(
*/
xfs_sb_from_disk(&mp->m_sb, dsb);
+ /*
+ * Grow can change the device size. Mirror that into the buftarg.
+ */
+ mp->m_ddev_targp->bt_nr_sectors =
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) {
+ mp->m_rtdev_targp->bt_nr_sectors =
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ }
+
if (mp->m_sb.sb_agcount < orig_agcount) {
xfs_alert(mp, "Shrinking AG count in log recovery not supported");
return -EFSCORRUPTED;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index dbd87e137694..39830b252ac8 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -10,61 +10,17 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_sysfs.h"
#include "xfs_inode.h"
#ifdef DEBUG
-static unsigned int xfs_errortag_random_default[] = {
- XFS_RANDOM_DEFAULT,
- XFS_RANDOM_IFLUSH_1,
- XFS_RANDOM_IFLUSH_2,
- XFS_RANDOM_IFLUSH_3,
- XFS_RANDOM_IFLUSH_4,
- XFS_RANDOM_IFLUSH_5,
- XFS_RANDOM_IFLUSH_6,
- XFS_RANDOM_DA_READ_BUF,
- XFS_RANDOM_BTREE_CHECK_LBLOCK,
- XFS_RANDOM_BTREE_CHECK_SBLOCK,
- XFS_RANDOM_ALLOC_READ_AGF,
- XFS_RANDOM_IALLOC_READ_AGI,
- XFS_RANDOM_ITOBP_INOTOBP,
- XFS_RANDOM_IUNLINK,
- XFS_RANDOM_IUNLINK_REMOVE,
- XFS_RANDOM_DIR_INO_VALIDATE,
- XFS_RANDOM_BULKSTAT_READ_CHUNK,
- XFS_RANDOM_IODONE_IOERR,
- XFS_RANDOM_STRATREAD_IOERR,
- XFS_RANDOM_STRATCMPL_IOERR,
- XFS_RANDOM_DIOWRITE_IOERR,
- XFS_RANDOM_BMAPIFORMAT,
- XFS_RANDOM_FREE_EXTENT,
- XFS_RANDOM_RMAP_FINISH_ONE,
- XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE,
- XFS_RANDOM_REFCOUNT_FINISH_ONE,
- XFS_RANDOM_BMAP_FINISH_ONE,
- XFS_RANDOM_AG_RESV_CRITICAL,
- 0, /* XFS_RANDOM_DROP_WRITES has been removed */
- XFS_RANDOM_LOG_BAD_CRC,
- XFS_RANDOM_LOG_ITEM_PIN,
- XFS_RANDOM_BUF_LRU_REF,
- XFS_RANDOM_FORCE_SCRUB_REPAIR,
- XFS_RANDOM_FORCE_SUMMARY_RECALC,
- XFS_RANDOM_IUNLINK_FALLBACK,
- XFS_RANDOM_BUF_IOERROR,
- XFS_RANDOM_REDUCE_MAX_IEXTENTS,
- XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
- XFS_RANDOM_AG_RESV_FAIL,
- XFS_RANDOM_LARP,
- XFS_RANDOM_DA_LEAF_SPLIT,
- XFS_RANDOM_ATTR_LEAF_TO_NODE,
- XFS_RANDOM_WB_DELAY_MS,
- XFS_RANDOM_WRITE_DELAY_MS,
- XFS_RANDOM_EXCHMAPS_FINISH_ONE,
- XFS_RANDOM_METAFILE_RESV_CRITICAL,
-};
+#define XFS_ERRTAG(_tag, _name, _default) \
+ [XFS_ERRTAG_##_tag] = (_default),
+#include "xfs_errortag.h"
+static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS };
+#undef XFS_ERRTAG
struct xfs_errortag_attr {
struct attribute attr;
@@ -93,21 +49,18 @@ xfs_errortag_attr_store(
size_t count)
{
struct xfs_mount *mp = to_mp(kobject);
- struct xfs_errortag_attr *xfs_attr = to_attr(attr);
+ unsigned int error_tag = to_attr(attr)->tag;
int ret;
- unsigned int val;
if (strcmp(buf, "default") == 0) {
- val = xfs_errortag_random_default[xfs_attr->tag];
+ mp->m_errortag[error_tag] =
+ xfs_errortag_random_default[error_tag];
} else {
- ret = kstrtouint(buf, 0, &val);
+ ret = kstrtouint(buf, 0, &mp->m_errortag[error_tag]);
if (ret)
return ret;
}
- ret = xfs_errortag_set(mp, xfs_attr->tag, val);
- if (ret)
- return ret;
return count;
}
@@ -118,10 +71,9 @@ xfs_errortag_attr_show(
char *buf)
{
struct xfs_mount *mp = to_mp(kobject);
- struct xfs_errortag_attr *xfs_attr = to_attr(attr);
+ unsigned int error_tag = to_attr(attr)->tag;
- return snprintf(buf, PAGE_SIZE, "%u\n",
- xfs_errortag_get(mp, xfs_attr->tag));
+ return snprintf(buf, PAGE_SIZE, "%u\n", mp->m_errortag[error_tag]);
}
static const struct sysfs_ops xfs_errortag_sysfs_ops = {
@@ -129,110 +81,28 @@ static const struct sysfs_ops xfs_errortag_sysfs_ops = {
.store = xfs_errortag_attr_store,
};
-#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \
+#define XFS_ERRTAG(_tag, _name, _default) \
static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \
.attr = {.name = __stringify(_name), \
.mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \
- .tag = (_tag), \
-}
-
-#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr
-
-XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR);
-XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1);
-XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2);
-XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3);
-XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4);
-XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5);
-XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6);
-XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF);
-XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK);
-XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK);
-XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF);
-XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI);
-XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP);
-XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK);
-XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE);
-XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE);
-XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK);
-XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR);
-XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR);
-XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR);
-XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR);
-XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT);
-XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT);
-XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE);
-XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
-XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
-XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
-XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
-XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
-XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
-XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
-XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
-XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
-XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
-XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
-XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
-XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
-XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
-XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS);
-XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
-XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+ .tag = XFS_ERRTAG_##_tag, \
+};
+#include "xfs_errortag.h"
+XFS_ERRTAGS
+#undef XFS_ERRTAG
+#define XFS_ERRTAG(_tag, _name, _default) \
+ &xfs_errortag_attr_##_name.attr,
+#include "xfs_errortag.h"
static struct attribute *xfs_errortag_attrs[] = {
- XFS_ERRORTAG_ATTR_LIST(noerror),
- XFS_ERRORTAG_ATTR_LIST(iflush1),
- XFS_ERRORTAG_ATTR_LIST(iflush2),
- XFS_ERRORTAG_ATTR_LIST(iflush3),
- XFS_ERRORTAG_ATTR_LIST(iflush4),
- XFS_ERRORTAG_ATTR_LIST(iflush5),
- XFS_ERRORTAG_ATTR_LIST(iflush6),
- XFS_ERRORTAG_ATTR_LIST(dareadbuf),
- XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk),
- XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk),
- XFS_ERRORTAG_ATTR_LIST(readagf),
- XFS_ERRORTAG_ATTR_LIST(readagi),
- XFS_ERRORTAG_ATTR_LIST(itobp),
- XFS_ERRORTAG_ATTR_LIST(iunlink),
- XFS_ERRORTAG_ATTR_LIST(iunlinkrm),
- XFS_ERRORTAG_ATTR_LIST(dirinovalid),
- XFS_ERRORTAG_ATTR_LIST(bulkstat),
- XFS_ERRORTAG_ATTR_LIST(logiodone),
- XFS_ERRORTAG_ATTR_LIST(stratread),
- XFS_ERRORTAG_ATTR_LIST(stratcmpl),
- XFS_ERRORTAG_ATTR_LIST(diowrite),
- XFS_ERRORTAG_ATTR_LIST(bmapifmt),
- XFS_ERRORTAG_ATTR_LIST(free_extent),
- XFS_ERRORTAG_ATTR_LIST(rmap_finish_one),
- XFS_ERRORTAG_ATTR_LIST(refcount_continue_update),
- XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
- XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
- XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
- XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
- XFS_ERRORTAG_ATTR_LIST(log_item_pin),
- XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
- XFS_ERRORTAG_ATTR_LIST(force_repair),
- XFS_ERRORTAG_ATTR_LIST(bad_summary),
- XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
- XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
- XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
- XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
- XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
- XFS_ERRORTAG_ATTR_LIST(larp),
- XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
- XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
- XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
- XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
- XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one),
- XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit),
- NULL,
+ XFS_ERRTAGS
+ NULL
};
ATTRIBUTE_GROUPS(xfs_errortag);
+#undef XFS_ERRTAG
+
+/* -1 because XFS_ERRTAG_DROP_WRITES got removed, + 1 for NULL termination */
+static_assert(ARRAY_SIZE(xfs_errortag_attrs) == XFS_ERRTAG_MAX);
static const struct kobj_type xfs_errortag_ktype = {
.release = xfs_sysfs_release,
@@ -295,7 +165,6 @@ xfs_errortag_enabled(
bool
xfs_errortag_test(
struct xfs_mount *mp,
- const char *expression,
const char *file,
int line,
unsigned int error_tag)
@@ -321,36 +190,12 @@ xfs_errortag_test(
return false;
xfs_warn_ratelimited(mp,
-"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
- expression, file, line, mp->m_super->s_id);
+"Injecting error at file %s, line %d, on filesystem \"%s\"",
+ file, line, mp->m_super->s_id);
return true;
}
int
-xfs_errortag_get(
- struct xfs_mount *mp,
- unsigned int error_tag)
-{
- if (!xfs_errortag_valid(error_tag))
- return -EINVAL;
-
- return mp->m_errortag[error_tag];
-}
-
-int
-xfs_errortag_set(
- struct xfs_mount *mp,
- unsigned int error_tag,
- unsigned int tag_value)
-{
- if (!xfs_errortag_valid(error_tag))
- return -EINVAL;
-
- mp->m_errortag[error_tag] = tag_value;
- return 0;
-}
-
-int
xfs_errortag_add(
struct xfs_mount *mp,
unsigned int error_tag)
@@ -359,9 +204,8 @@ xfs_errortag_add(
if (!xfs_errortag_valid(error_tag))
return -EINVAL;
-
- return xfs_errortag_set(mp, error_tag,
- xfs_errortag_random_default[error_tag]);
+ mp->m_errortag[error_tag] = xfs_errortag_random_default[error_tag];
+ return 0;
}
int
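
The tables removed above are regenerated from a single X-macro list in xfs_errortag.h; that header is not part of this hunk, so the self-contained sketch below only approximates its shape. The three entries, the stand-in enum and the stand-in default values are illustrative, taken from the deleted tables, and should not be read as the real header contents.

/* Local stand-ins so the sketch compiles outside the kernel tree. */
enum { XFS_ERRTAG_NOERROR, XFS_ERRTAG_IFLUSH_1, XFS_ERRTAG_FREE_EXTENT };
#define XFS_RANDOM_DEFAULT	100
#define XFS_RANDOM_IFLUSH_1	100
#define XFS_RANDOM_FREE_EXTENT	100

/* Assumed shape of the X-macro list provided by xfs_errortag.h: */
#define XFS_ERRTAGS \
XFS_ERRTAG(NOERROR,	noerror,	XFS_RANDOM_DEFAULT) \
XFS_ERRTAG(IFLUSH_1,	iflush1,	XFS_RANDOM_IFLUSH_1) \
XFS_ERRTAG(FREE_EXTENT, free_extent,	XFS_RANDOM_FREE_EXTENT)

/* The "default value" expansion used in xfs_error.c turns the list into a
 * designated-initializer table indexed by tag: */
#define XFS_ERRTAG(_tag, _name, _default) \
	[XFS_ERRTAG_##_tag] = (_default),
static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS };
#undef XFS_ERRTAG
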
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0b9c5ba8a598..fe6a71bbe9cd 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -8,22 +8,17 @@
struct xfs_mount;
-extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
- const char *filename, int linenum,
- xfs_failaddr_t failaddr);
-extern void xfs_corruption_error(const char *tag, int level,
- struct xfs_mount *mp, const void *buf, size_t bufsize,
- const char *filename, int linenum,
- xfs_failaddr_t failaddr);
+void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+ const char *filename, int linenum, xfs_failaddr_t failaddr);
+void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp,
+ const void *buf, size_t bufsize, const char *filename,
+ int linenum, xfs_failaddr_t failaddr);
void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa);
-extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
- const char *name, const void *buf, size_t bufsz,
- xfs_failaddr_t failaddr);
-extern void xfs_verifier_error(struct xfs_buf *bp, int error,
- xfs_failaddr_t failaddr);
-extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
- const char *name, const void *buf, size_t bufsz,
- xfs_failaddr_t failaddr);
+void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name,
+ const void *buf, size_t bufsz, xfs_failaddr_t failaddr);
+void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr);
+void xfs_inode_verifier_error(struct xfs_inode *ip, int error, const char *name,
+ const void *buf, size_t bufsz, xfs_failaddr_t failaddr);
#define XFS_ERROR_REPORT(e, lvl, mp) \
xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
@@ -39,12 +34,12 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
#define XFS_CORRUPTION_DUMP_LEN (128)
#ifdef DEBUG
-extern int xfs_errortag_init(struct xfs_mount *mp);
-extern void xfs_errortag_del(struct xfs_mount *mp);
-extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression,
- const char *file, int line, unsigned int error_tag);
-#define XFS_TEST_ERROR(expr, mp, tag) \
- ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag)))
+int xfs_errortag_init(struct xfs_mount *mp);
+void xfs_errortag_del(struct xfs_mount *mp);
+bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line,
+ unsigned int error_tag);
+#define XFS_TEST_ERROR(mp, tag) \
+ xfs_errortag_test((mp), __FILE__, __LINE__, (tag))
bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
#define XFS_ERRORTAG_DELAY(mp, tag) \
do { \
@@ -58,17 +53,13 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
mdelay((mp)->m_errortag[(tag)]); \
} while (0)
-extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag);
-extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag,
- unsigned int tag_value);
-extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
-extern int xfs_errortag_clearall(struct xfs_mount *mp);
+int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
+int xfs_errortag_clearall(struct xfs_mount *mp);
#else
#define xfs_errortag_init(mp) (0)
#define xfs_errortag_del(mp)
-#define XFS_TEST_ERROR(expr, mp, tag) (expr)
+#define XFS_TEST_ERROR(mp, tag) (false)
#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0)
-#define xfs_errortag_set(mp, tag, val) (ENOSYS)
#define xfs_errortag_add(mp, tag) (ENOSYS)
#define xfs_errortag_clearall(mp) (ENOSYS)
#endif /* DEBUG */
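
The XFS_TEST_ERROR() change above drops the expression argument, so every call
site folds the real check back into its own conditional and the macro becomes
pure error injection. A minimal sketch of the conversion pattern (mirroring the
xfs_iflush() hunk later in this patch, not a literal copy):

	/* before: the corruption check was passed into the injection macro */
	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
			   mp, XFS_ERRTAG_IFLUSH_1))
		goto flush_out;

	/* after: real corruption check first, pure error injection second */
	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) ||
	    XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1))
		goto flush_out;
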
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 47ee598a9827..418ddab590e0 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -202,7 +202,7 @@ xfs_efi_copy_format(
sizeof(struct xfs_extent));
return 0;
} else if (buf->iov_len == len32) {
- xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base;
+ struct xfs_efi_log_format_32 *src_efi_fmt_32 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
@@ -216,7 +216,7 @@ xfs_efi_copy_format(
}
return 0;
} else if (buf->iov_len == len64) {
- xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base;
+ struct xfs_efi_log_format_64 *src_efi_fmt_64 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index c8402040410b..af1b0331f7af 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -49,7 +49,7 @@ struct xfs_efi_log_item {
struct xfs_log_item efi_item;
atomic_t efi_refcount;
atomic_t efi_next_extent;
- xfs_efi_log_format_t efi_format;
+ struct xfs_efi_log_format efi_format;
};
static inline size_t
@@ -69,7 +69,7 @@ struct xfs_efd_log_item {
struct xfs_log_item efd_item;
struct xfs_efi_log_item *efd_efip;
uint efd_next_extent;
- xfs_efd_log_format_t efd_format;
+ struct xfs_efd_log_format efd_format;
};
static inline size_t
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f96fbf5c54c9..2702fef2c90c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -75,52 +75,47 @@ xfs_dir_fsync(
return xfs_log_force_inode(ip);
}
-static xfs_csn_t
-xfs_fsync_seq(
- struct xfs_inode *ip,
- bool datasync)
-{
- if (!xfs_ipincount(ip))
- return 0;
- if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- return 0;
- return ip->i_itemp->ili_commit_seq;
-}
-
/*
- * All metadata updates are logged, which means that we just have to flush the
- * log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to push the
+ * journal to the required sequence number that holds the updates. We track
+ * datasync commits separately from full sync commits, and hence only need to
+ * select the correct sequence number for the log force here.
*
- * If we have concurrent fsync/fdatasync() calls, we need them to all block on
- * the log force before we clear the ili_fsync_fields field. This ensures that
- * we don't get a racing sync operation that does not wait for the metadata to
- * hit the journal before returning. If we race with clearing ili_fsync_fields,
- * then all that will happen is the log force will do nothing as the lsn will
- * already be on disk. We can't race with setting ili_fsync_fields because that
- * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
- * shared until after the ili_fsync_fields is cleared.
+ * We don't have to serialise against concurrent modifications, as we do not
+ * have to wait for modifications that have not yet completed. We define a
+ * transaction commit as completing when the commit sequence number is updated,
+ * hence if the sequence number has not been updated, the sync operation was
+ * run before the commit completed and we don't have to wait for it.
+ *
+ * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
+ * set on the log item until - at least - the journal flush completes. In
+ * reality, they are only cleared when the inode is fully unpinned (i.e.
+ * persistent in the journal and not dirty in the CIL), and so we rely on
+ * xfs_log_force_seq() either skipping sequences that have been persisted or
+ * waiting on sequences that are still in flight to correctly order concurrent
+ * sync operations.
*/
-static int
+static int
xfs_fsync_flush_log(
struct xfs_inode *ip,
bool datasync,
int *log_flushed)
{
- int error = 0;
- xfs_csn_t seq;
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ xfs_csn_t seq = 0;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- seq = xfs_fsync_seq(ip, datasync);
- if (seq) {
- error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
- log_flushed);
+ spin_lock(&iip->ili_lock);
+ if (datasync)
+ seq = iip->ili_datasync_seq;
+ else
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
- spin_lock(&ip->i_itemp->ili_lock);
- ip->i_itemp->ili_fsync_fields = 0;
- spin_unlock(&ip->i_itemp->ili_lock);
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return error;
+ if (!seq)
+ return 0;
+
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+ log_flushed);
}
STATIC int
@@ -158,12 +153,10 @@ xfs_file_fsync(
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
- * Any inode that has dirty modifications in the log is pinned. The
- * racy check here for a pinned inode will not catch modifications
- * that happen concurrently to the fsync call, but fsync semantics
- * only require to sync previously completed I/O.
+ * If the inode has an inode log item attached, it may need the journal
+ * flushed to persist any changes the log item might be tracking.
*/
- if (xfs_ipincount(ip)) {
+ if (ip->i_itemp) {
err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
if (err2 && !error)
error = err2;
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index f6f628c01feb..566fd663c95b 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -14,8 +14,6 @@
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
- .sgid_inherit = { 0, 0, 1 },
- .symlink_mode = { 0, 0, 1 },
.panic_mask = { 0, 0, XFS_PTAG_MASK},
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 4cf7abe50143..e44040206851 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -646,8 +646,7 @@ xfs_iget_cache_miss(
goto out_destroy;
/*
- * For version 5 superblocks, if we are initialising a new inode and we
- * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
+ * For version 5 superblocks, if we are initialising a new inode, we
* simply build the new inode core with a random generation number.
*
* For version 4 (and older) superblocks, log recovery is dependent on
@@ -655,8 +654,7 @@ xfs_iget_cache_miss(
* value and hence we must also read the inode off disk even when
* initializing new inodes.
*/
- if (xfs_has_v3inodes(mp) &&
- (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
+ if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) {
VFS_I(ip)->i_generation = get_random_u32();
} else {
struct xfs_buf *bp;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9c39251961a3..36b39539e561 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -877,6 +877,35 @@ xfs_create_tmpfile(
return error;
}
+static inline int
+xfs_projid_differ(
+ struct xfs_inode *tdp,
+ struct xfs_inode *sip)
+{
+ /*
+ * If we are using project inheritance, we only allow hard link/rename
+ * creation in our tree when the project IDs are the same; else
+ * the tree quota mechanism could be circumvented.
+ */
+ if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+ tdp->i_projid != sip->i_projid)) {
+ /*
+ * Project quota setup skips special files which can
+ * leave inodes in a PROJINHERIT directory without a
+ * project ID set. We need to allow links to be made
+ * to these "project-less" inodes because userspace
+ * expects them to succeed after project ID setup,
+ * but everything else should be rejected.
+ */
+ if (!special_file(VFS_I(sip)->i_mode) ||
+ sip->i_projid != 0) {
+ return -EXDEV;
+ }
+ }
+
+ return 0;
+}
+
int
xfs_link(
struct xfs_inode *tdp,
@@ -930,27 +959,9 @@ xfs_link(
goto error_return;
}
- /*
- * If we are using project inheritance, we only allow hard link
- * creation in our tree when the project IDs are the same; else
- * the tree quota mechanism could be circumvented.
- */
- if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
- tdp->i_projid != sip->i_projid)) {
- /*
- * Project quota setup skips special files which can
- * leave inodes in a PROJINHERIT directory without a
- * project ID set. We need to allow links to be made
- * to these "project-less" inodes because userspace
- * expects them to succeed after project ID setup,
- * but everything else should be rejected.
- */
- if (!special_file(VFS_I(sip)->i_mode) ||
- sip->i_projid != 0) {
- error = -EXDEV;
- goto error_return;
- }
- }
+ error = xfs_projid_differ(tdp, sip);
+ if (error)
+ goto error_return;
error = xfs_dir_add_child(tp, resblks, &du);
if (error)
@@ -1035,7 +1046,7 @@ xfs_itruncate_extents_flags(
int error = 0;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- if (atomic_read(&VFS_I(ip)->i_count))
+ if (icount_read(VFS_I(ip)))
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -1656,7 +1667,6 @@ retry:
spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
spin_unlock(&iip->ili_lock);
ASSERT(iip->ili_last_fields);
@@ -1821,12 +1831,20 @@ static void
xfs_iunpin(
struct xfs_inode *ip)
{
- xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ xfs_csn_t seq = 0;
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
+
+ spin_lock(&iip->ili_lock);
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
+ if (!seq)
+ return;
/* Give the log a push to start the unpinning I/O */
- xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
+ xfs_log_force_seq(ip->i_mount, seq, 0, NULL);
}
@@ -2227,16 +2245,9 @@ retry:
if (du_wip.ip)
xfs_trans_ijoin(tp, du_wip.ip, 0);
- /*
- * If we are using project inheritance, we only allow renames
- * into our tree when the project IDs are the same; else the
- * tree quota mechanism would be circumvented.
- */
- if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
- target_dp->i_projid != src_ip->i_projid)) {
- error = -EXDEV;
+ error = xfs_projid_differ(target_dp, src_ip);
+ if (error)
goto out_trans_cancel;
- }
/* RENAME_EXCHANGE is unique from here on. */
if (flags & RENAME_EXCHANGE) {
@@ -2377,8 +2388,8 @@ xfs_iflush(
* error handling as the caller will shutdown and fail the buffer.
*/
error = -EFSCORRUPTED;
- if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
- mp, XFS_ERRTAG_IFLUSH_1)) {
+ if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
@@ -2394,29 +2405,27 @@ xfs_iflush(
goto flush_out;
}
} else if (S_ISREG(VFS_I(ip)->i_mode)) {
- if (XFS_TEST_ERROR(
- ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
- mp, XFS_ERRTAG_IFLUSH_3)) {
+ if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_3)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad regular inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
- if (XFS_TEST_ERROR(
- ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
- ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
- mp, XFS_ERRTAG_IFLUSH_4)) {
+ if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
+ ip->i_df.if_format != XFS_DINODE_FMT_LOCAL) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_4)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad directory inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
}
- if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
- ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
+ if (ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
+ ip->i_nblocks || XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: detected corrupt incore inode %llu, "
"total extents = %llu nblocks = %lld, ptr "PTR_FMT,
@@ -2425,8 +2434,8 @@ xfs_iflush(
ip->i_nblocks, ip);
goto flush_out;
}
- if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
- mp, XFS_ERRTAG_IFLUSH_6)) {
+ if (ip->i_forkoff > mp->m_sb.sb_inodesize ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_6)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, ip->i_forkoff, ip);
@@ -2502,7 +2511,6 @@ flush_out:
spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
spin_unlock(&iip->ili_lock);
@@ -2661,12 +2669,15 @@ int
xfs_log_force_inode(
struct xfs_inode *ip)
{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
xfs_csn_t seq = 0;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- seq = ip->i_itemp->ili_commit_seq;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (!iip)
+ return 0;
+
+ spin_lock(&iip->ili_lock);
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
if (!seq)
return 0;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 829675700fcd..1bd411a1114c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -131,46 +131,28 @@ xfs_inode_item_precommit(
}
/*
- * Inode verifiers do not check that the extent size hint is an integer
- * multiple of the rt extent size on a directory with both rtinherit
- * and extszinherit flags set. If we're logging a directory that is
- * misconfigured in this way, clear the hint.
+ * Inode verifiers do not check that the extent size hints are
+ * integer multiples of the rt extent size on a directory with the
+ * rtinherit flag set. If we're logging a directory that is
+ * misconfigured in this way, clear the bad hints.
*/
- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
- xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
- ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
- XFS_DIFLAG_EXTSZINHERIT);
- ip->i_extsize = 0;
- flags |= XFS_ILOG_CORE;
+ if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) {
+ if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+ if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
+ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
}
- /*
- * Record the specific change for fdatasync optimisation. This allows
- * fdatasync to skip log forces for inodes that are only timestamp
- * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
- * to XFS_ILOG_CORE so that the actual on-disk dirty tracking
- * (ili_fields) correctly tracks that the version has changed.
- */
spin_lock(&iip->ili_lock);
- iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
- if (flags & XFS_ILOG_IVERSION)
- flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
-
- /*
- * Inode verifiers do not check that the CoW extent size hint is an
- * integer multiple of the rt extent size on a directory with both
- * rtinherit and cowextsize flags set. If we're logging a directory
- * that is misconfigured in this way, clear the hint.
- */
- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
- xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
- ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
- ip->i_cowextsize = 0;
- flags |= XFS_ILOG_CORE;
- }
-
if (!iip->ili_item.li_buf) {
struct xfs_buf *bp;
int error;
@@ -205,6 +187,20 @@ xfs_inode_item_precommit(
}
/*
+ * Store the dirty flags back into the inode item as this state is used
+ * later on in xfs_inode_item_committing() to determine whether the
+ * transaction is relevant to fsync state or not.
+ */
+ iip->ili_dirty_flags = flags;
+
+ /*
+ * Convert the flags to the on-disk fields that have been modified in the
+ * transaction so that ili_fields tracks the changes correctly.
+ */
+ if (flags & XFS_ILOG_IVERSION)
+ flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
+
+ /*
* Always OR in the bits from the ili_last_fields field. This is to
* coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
* in the eventual clearing of the ili_fields bits. See the big comment
@@ -214,12 +210,6 @@ xfs_inode_item_precommit(
spin_unlock(&iip->ili_lock);
xfs_inode_item_precommit_check(ip);
-
- /*
- * We are done with the log item transaction dirty state, so clear it so
- * that it doesn't pollute future transactions.
- */
- iip->ili_dirty_flags = 0;
return 0;
}
@@ -729,13 +719,24 @@ xfs_inode_item_unpin(
struct xfs_log_item *lip,
int remove)
{
- struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
ASSERT(atomic_read(&ip->i_pincount) > 0);
- if (atomic_dec_and_test(&ip->i_pincount))
+
+ /*
+ * If this is the last unpin, then the inode no longer needs a journal
+ * flush to persist it. Hence we can clear the commit sequence numbers
+ * as an fsync/fdatasync operation on the inode at this point is a no-op.
+ */
+ if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) {
+ iip->ili_commit_seq = 0;
+ iip->ili_datasync_seq = 0;
+ spin_unlock(&iip->ili_lock);
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+ }
}
STATIC uint
@@ -858,12 +859,45 @@ xfs_inode_item_committed(
return lsn;
}
+/*
+ * The modification is now complete, so before we unlock the inode we need to
+ * update the commit sequence numbers for data integrity journal flushes. We
+ * always record the commit sequence number (ili_commit_seq) so that anything
+ * that needs a full journal sync will capture all of this modification.
+ *
+ * We then check if the changes will impact a datasync (O_DSYNC) journal
+ * flush. If the
+ * changes will require a datasync flush, then we also record the sequence in
+ * ili_datasync_seq.
+ *
+ * These commit sequence numbers are cleared atomically with the inode being
+ * unpinned (i.e. the pin count going to zero), and so they are only set while
+ * the inode is dirty in the journal. This removes the need for checking if the
+ * inode is pinned to determine if a journal flush is necessary, and hence
+ * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to
+ * serialise pin counts against commit sequence number updates.
+ */
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
xfs_csn_t seq)
{
- INODE_ITEM(lip)->ili_commit_seq = seq;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+
+ spin_lock(&iip->ili_lock);
+ iip->ili_commit_seq = seq;
+ if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP))
+ iip->ili_datasync_seq = seq;
+ spin_unlock(&iip->ili_lock);
+
+ /*
+ * Clear the per-transaction dirty flags now that we have finished
+ * recording the transaction's inode modifications in the CIL and are
+ * about to release and (maybe) unlock the inode.
+ */
+ iip->ili_dirty_flags = 0;
+
return xfs_inode_item_release(lip);
}
@@ -1055,7 +1089,6 @@ xfs_iflush_abort_clean(
{
iip->ili_last_fields = 0;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
iip->ili_flush_lsn = 0;
iip->ili_item.li_buf = NULL;
list_del_init(&iip->ili_item.li_bio_list);
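
Pulling the xfs_file.c, xfs_inode.c and xfs_inode_item.c hunks together, the
sequence number lifecycle this patch introduces looks roughly like the
following condensed sketch (simplified from the hunks above, not literal code):

	/* transaction commit (CIL insertion): record what a sync must wait for */
	spin_lock(&iip->ili_lock);
	iip->ili_commit_seq = seq;
	if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP))
		iip->ili_datasync_seq = seq;	/* fdatasync needs a flush too */
	spin_unlock(&iip->ili_lock);

	/* fsync()/fdatasync(): pick the right sequence, no ILOCK needed */
	spin_lock(&iip->ili_lock);
	seq = datasync ? iip->ili_datasync_seq : iip->ili_commit_seq;
	spin_unlock(&iip->ili_lock);
	if (seq)
		error = xfs_log_force_seq(mp, seq, XFS_LOG_SYNC, log_flushed);

	/* final unpin: the inode is persistent, nothing left to sync */
	if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) {
		iip->ili_commit_seq = 0;
		iip->ili_datasync_seq = 0;
		spin_unlock(&iip->ili_lock);
	}
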
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index ba92ce11a011..2ddcca41714f 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -32,9 +32,17 @@ struct xfs_inode_log_item {
spinlock_t ili_lock; /* flush state lock */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
- unsigned int ili_fsync_fields; /* logged since last fsync */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
+
+ /*
+ * We record the sequence number for every inode modification, as
+ * well as those that only require fdatasync operations for data
+ * integrity. This allows optimisation of the O_DSYNC/fdatasync path
+ * without needing to track what modifications the journal is currently
+ * carrying for the inode. These are protected by the above ili_lock.
+ */
xfs_csn_t ili_commit_seq; /* last transaction commit */
+ xfs_csn_t ili_datasync_seq; /* for datasync optimisation */
};
static inline int xfs_inode_clean(struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index e1051a530a50..a6bb7ee7a27a 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -512,9 +512,6 @@ xfs_fileattr_get(
{
struct xfs_inode *ip = XFS_I(d_inode(dentry));
- if (d_is_special(dentry))
- return -ENOTTY;
-
xfs_ilock(ip, XFS_ILOCK_SHARED);
xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -736,9 +733,6 @@ xfs_fileattr_set(
trace_xfs_ioctl_setattr(ip);
- if (d_is_special(dentry))
- return -ENOTTY;
-
if (!fa->fsx_valid) {
if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
FS_NOATIME_FL | FS_NODUMP_FL |
@@ -1209,21 +1203,21 @@ xfs_file_ioctl(
current->comm);
return -ENOTTY;
case XFS_IOC_DIOINFO: {
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct kstat st;
struct dioattr da;
- da.d_mem = target->bt_logical_sectorsize;
+ error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0);
+ if (error)
+ return error;
/*
- * See xfs_report_dioalign() for an explanation about why this
- * reports a value larger than the sector size for COW inodes.
+ * Some userspace directly feeds the return value to
+ * posix_memalign, which fails for values that are smaller than
+ * the pointer size. Round up the value to not break userspace.
*/
- if (xfs_is_cow_inode(ip))
- da.d_miniosz = xfs_inode_alloc_unitsize(ip);
- else
- da.d_miniosz = target->bt_logical_sectorsize;
+ da.d_mem = roundup(st.dio_mem_align, sizeof(void *));
+ da.d_miniosz = st.dio_offset_align;
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
-
if (copy_to_user(arg, &da, sizeof(da)))
return -EFAULT;
return 0;
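
From userspace, the reworked XFS_IOC_DIOINFO now reports the same geometry as
statx(STATX_DIOALIGN), with d_mem rounded up to the pointer size. A small
illustrative program to compare the two (header names and the availability of
STATX_DIOALIGN depend on the installed libc/kernel headers; <xfs/xfs.h> is
assumed to come from xfsprogs):

	#define _GNU_SOURCE
	#include <xfs/xfs.h>		/* XFS_IOC_DIOINFO, struct dioattr */
	#include <sys/ioctl.h>
	#include <sys/stat.h>		/* statx(), STATX_DIOALIGN */
	#include <fcntl.h>
	#include <stdio.h>

	int main(int argc, char **argv)
	{
		struct dioattr da;
		struct statx stx;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;

		if (ioctl(fd, XFS_IOC_DIOINFO, &da) == 0)
			printf("ioctl: d_mem=%u d_miniosz=%u d_maxiosz=%u\n",
			       da.d_mem, da.d_miniosz, da.d_maxiosz);

		if (statx(fd, "", AT_EMPTY_PATH, STATX_DIOALIGN, &stx) == 0)
			printf("statx: mem_align=%u offset_align=%u\n",
			       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
		return 0;
	}
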
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2a74f2957341..d3f6e3e42a11 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -149,9 +149,18 @@ xfs_bmbt_to_iomap(
iomap->bdev = target->bt_bdev;
iomap->flags = iomap_flags;
- if (xfs_ipincount(ip) &&
- (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- iomap->flags |= IOMAP_F_DIRTY;
+ /*
+ * If the inode is dirty for datasync purposes, let iomap know so it
+ * doesn't elide the IO completion journal flushes on O_DSYNC IO.
+ */
+ if (ip->i_itemp) {
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+
+ spin_lock(&iip->ili_lock);
+ if (iip->ili_datasync_seq)
+ iomap->flags |= IOMAP_F_DIRTY;
+ spin_unlock(&iip->ili_lock);
+ }
iomap->validity_cookie = sequence_cookie;
return 0;
@@ -1554,7 +1563,7 @@ xfs_zoned_buffered_write_iomap_begin(
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
@@ -1728,7 +1737,7 @@ xfs_buffered_write_iomap_begin(
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 603effabe1ee..caff0125faea 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -431,14 +431,12 @@ xfs_vn_symlink(
struct dentry *dentry,
const char *symname)
{
- struct inode *inode;
- struct xfs_inode *cip = NULL;
- struct xfs_name name;
- int error;
- umode_t mode;
+ struct inode *inode;
+ struct xfs_inode *cip = NULL;
+ struct xfs_name name;
+ int error;
+ umode_t mode = S_IFLNK | S_IRWXUGO;
- mode = S_IFLNK |
- (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
error = xfs_dentry_mode_to_name(&name, dentry, mode);
if (unlikely(error))
goto out;
@@ -1335,6 +1333,8 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.setattr = xfs_vn_setattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
};
/* Figure out if this file actually supports DAX. */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 9a2221b4aa21..4dd747bdbcca 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -89,8 +89,6 @@ typedef __u32 xfs_nlink_t;
#undef XFS_NATIVE_HOST
#endif
-#define irix_sgid_inherit xfs_params.sgid_inherit.val
-#define irix_symlink_mode xfs_params.symlink_mode.val
#define xfs_panic_mask xfs_params.panic_mask.val
#define xfs_error_level xfs_params.error_level.val
#define xfs_syncd_centisecs xfs_params.syncd_timer.val
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c8a57e21a1d3..603e85c1ab4c 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -969,8 +969,8 @@ xfs_log_unmount_write(
* counters will be recalculated. Refer to xlog_check_unmount_rec for
* more details.
*/
- if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
- XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
xfs_alert(mp, "%s: will fix summary counters at next mount",
__func__);
return;
@@ -1240,7 +1240,7 @@ xlog_ioend_work(
/*
* Race to shutdown the filesystem if we see an error.
*/
- if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
+ if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
xfs_alert(log->l_mp, "log I/O error %d", error);
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
@@ -1489,8 +1489,7 @@ xlog_alloc_log(
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM |
- WQ_HIGHPRI),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU),
0, mp->m_super->s_id);
if (!log->l_ioend_workqueue)
goto out_free_iclog;
@@ -1568,13 +1567,13 @@ xlog_cksum(
struct xlog *log,
struct xlog_rec_header *rhead,
char *dp,
- int size)
+ unsigned int hdrsize,
+ unsigned int size)
{
uint32_t crc;
/* first generate the crc for the record header ... */
- crc = xfs_start_cksum_update((char *)rhead,
- sizeof(struct xlog_rec_header),
+ crc = xfs_start_cksum_update((char *)rhead, hdrsize,
offsetof(struct xlog_rec_header, h_crc));
/* ... then for additional cycle data for v2 logs ... */
@@ -1818,7 +1817,7 @@ xlog_sync(
/* calculcate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
- iclog->ic_datap, size);
+ iclog->ic_datap, XLOG_REC_SIZE, size);
/*
* Intentionally corrupt the log record CRC based on the error injection
* frequency, if defined. This facilitates testing log recovery in the
@@ -1827,7 +1826,7 @@ xlog_sync(
* detects the bad CRC and attempts to recover.
*/
#ifdef DEBUG
- if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
+ if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
iclog->ic_fail_crc = true;
xfs_warn(log->l_mp,
@@ -2656,10 +2655,11 @@ restart:
* until you know exactly how many bytes get copied. Therefore, wait
* until later to update ic_offset.
*
- * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
+ * xlog_write() algorithm assumes that at least 2 xlog_op_header's
* can fit into remaining data section.
*/
- if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+ if (iclog->ic_size - iclog->ic_offset <
+ 2 * sizeof(struct xlog_op_header)) {
int error = 0;
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
@@ -3153,11 +3153,11 @@ xlog_calc_unit_res(
*/
/* for trans header */
- unit_bytes += sizeof(xlog_op_header_t);
- unit_bytes += sizeof(xfs_trans_header_t);
+ unit_bytes += sizeof(struct xlog_op_header);
+ unit_bytes += sizeof(struct xfs_trans_header);
/* for start-rec */
- unit_bytes += sizeof(xlog_op_header_t);
+ unit_bytes += sizeof(struct xlog_op_header);
/*
* for LR headers - the space for data in an iclog is the size minus
@@ -3180,12 +3180,12 @@ xlog_calc_unit_res(
num_headers = howmany(unit_bytes, iclog_space);
/* for split-recs - ophdrs added when data split over LRs */
- unit_bytes += sizeof(xlog_op_header_t) * num_headers;
+ unit_bytes += sizeof(struct xlog_op_header) * num_headers;
/* add extra header reservations if we overrun */
while (!num_headers ||
howmany(unit_bytes, iclog_space) > num_headers) {
- unit_bytes += sizeof(xlog_op_header_t);
+ unit_bytes += sizeof(struct xlog_op_header);
num_headers++;
}
unit_bytes += log->l_iclog_hsize * num_headers;
@@ -3322,7 +3322,7 @@ xlog_verify_iclog(
struct xlog_in_core *iclog,
int count)
{
- xlog_op_header_t *ophead;
+ struct xlog_op_header *ophead;
xlog_in_core_t *icptr;
xlog_in_core_2_t *xhdr;
void *base_ptr, *ptr, *p;
@@ -3400,7 +3400,7 @@ xlog_verify_iclog(
op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
}
}
- ptr += sizeof(xlog_op_header_t) + op_len;
+ ptr += sizeof(struct xlog_op_header) + op_len;
}
}
#endif
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index af6daf4f6792..dcc1f44ed68f 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -20,6 +20,43 @@ struct xfs_log_vec {
int lv_alloc_size; /* size of allocated lv */
};
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT 1
+#define XLOG_REG_TYPE_BCHUNK 2
+#define XLOG_REG_TYPE_EFI_FORMAT 3
+#define XLOG_REG_TYPE_EFD_FORMAT 4
+#define XLOG_REG_TYPE_IFORMAT 5
+#define XLOG_REG_TYPE_ICORE 6
+#define XLOG_REG_TYPE_IEXT 7
+#define XLOG_REG_TYPE_IBROOT 8
+#define XLOG_REG_TYPE_ILOCAL 9
+#define XLOG_REG_TYPE_IATTR_EXT 10
+#define XLOG_REG_TYPE_IATTR_BROOT 11
+#define XLOG_REG_TYPE_IATTR_LOCAL 12
+#define XLOG_REG_TYPE_QFORMAT 13
+#define XLOG_REG_TYPE_DQUOT 14
+#define XLOG_REG_TYPE_QUOTAOFF 15
+#define XLOG_REG_TYPE_LRHEADER 16
+#define XLOG_REG_TYPE_UNMOUNT 17
+#define XLOG_REG_TYPE_COMMIT 18
+#define XLOG_REG_TYPE_TRANSHDR 19
+#define XLOG_REG_TYPE_ICREATE 20
+#define XLOG_REG_TYPE_RUI_FORMAT 21
+#define XLOG_REG_TYPE_RUD_FORMAT 22
+#define XLOG_REG_TYPE_CUI_FORMAT 23
+#define XLOG_REG_TYPE_CUD_FORMAT 24
+#define XLOG_REG_TYPE_BUI_FORMAT 25
+#define XLOG_REG_TYPE_BUD_FORMAT 26
+#define XLOG_REG_TYPE_ATTRI_FORMAT 27
+#define XLOG_REG_TYPE_ATTRD_FORMAT 28
+#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_VALUE 30
+#define XLOG_REG_TYPE_XMI_FORMAT 31
+#define XLOG_REG_TYPE_XMD_FORMAT 32
+#define XLOG_REG_TYPE_ATTR_NEWNAME 33
+#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
+#define XLOG_REG_TYPE_MAX 34
+
#define XFS_LOG_VEC_ORDERED (-1)
/*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index a9a7a271c15b..0cfc654d8e87 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -499,8 +499,8 @@ xlog_recover_finish(
extern void
xlog_recover_cancel(struct xlog *);
-extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
- char *dp, int size);
+__le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+ char *dp, unsigned int hdrsize, unsigned int size);
extern struct kmem_cache *xfs_log_ticket_cache;
struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e6ed9e09c027..549d60959aee 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2894,20 +2894,34 @@ xlog_recover_process(
int pass,
struct list_head *buffer_list)
{
- __le32 old_crc = rhead->h_crc;
- __le32 crc;
+ __le32 expected_crc = rhead->h_crc, crc, other_crc;
- crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+ crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE,
+ be32_to_cpu(rhead->h_len));
+
+ /*
+ * Look at the end of the struct xlog_rec_header definition in
+ * xfs_log_format.h for the gory details.
+ */
+ if (expected_crc && crc != expected_crc) {
+ other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER,
+ be32_to_cpu(rhead->h_len));
+ if (other_crc == expected_crc) {
+ xfs_notice_once(log->l_mp,
+ "Fixing up incorrect CRC due to padding.");
+ crc = other_crc;
+ }
+ }
/*
* Nothing else to do if this is a CRC verification pass. Just return
* if this a record with a non-zero crc. Unfortunately, mkfs always
- * sets old_crc to 0 so we must consider this valid even on v5 supers.
- * Otherwise, return EFSBADCRC on failure so the callers up the stack
- * know precisely what failed.
+ * sets expected_crc to 0 so we must consider this valid even on v5
+ * supers. Otherwise, return EFSBADCRC on failure so the callers up the
+ * stack know precisely what failed.
*/
if (pass == XLOG_RECOVER_CRCPASS) {
- if (old_crc && crc != old_crc)
+ if (expected_crc && crc != expected_crc)
return -EFSBADCRC;
return 0;
}
@@ -2918,11 +2932,11 @@ xlog_recover_process(
* zero CRC check prevents warnings from being emitted when upgrading
* the kernel from one that does not add CRCs by default.
*/
- if (crc != old_crc) {
- if (old_crc || xfs_has_crc(log->l_mp)) {
+ if (crc != expected_crc) {
+ if (expected_crc || xfs_has_crc(log->l_mp)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
- le32_to_cpu(old_crc),
+ le32_to_cpu(expected_crc),
le32_to_cpu(crc));
xfs_hex_dump(dp, 32);
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index dc32c5e34d81..0953f6ae94ab 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1057,19 +1057,6 @@ xfs_mountfs(
xfs_inodegc_start(mp);
xfs_blockgc_start(mp);
- /*
- * Now that we've recovered any pending superblock feature bit
- * additions, we can finish setting up the attr2 behaviour for the
- * mount. The noattr2 option overrides the superblock flag, so only
- * check the superblock feature flag if the mount option is not set.
- */
- if (xfs_has_noattr2(mp)) {
- mp->m_features &= ~XFS_FEAT_ATTR2;
- } else if (!xfs_has_attr2(mp) &&
- (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
- mp->m_features |= XFS_FEAT_ATTR2;
- }
-
if (xfs_has_metadir(mp)) {
error = xfs_mount_setup_metadir(mp);
if (error)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 97de44c32272..f046d1215b04 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -363,7 +363,6 @@ typedef struct xfs_mount {
#define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */
#define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */
#define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */
-#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */
#define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */
#define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */
#define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */
@@ -386,7 +385,6 @@ typedef struct xfs_mount {
/* Mount features */
#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
-#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
#define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred
@@ -396,7 +394,6 @@ typedef struct xfs_mount {
#define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */
#define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */
#define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */
-#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/
#define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */
#define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */
#define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */
@@ -504,12 +501,17 @@ __XFS_HAS_V4_FEAT(align, ALIGN)
__XFS_HAS_V4_FEAT(logv2, LOGV2)
__XFS_HAS_V4_FEAT(extflg, EXTFLG)
__XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT)
-__XFS_ADD_V4_FEAT(attr2, ATTR2)
__XFS_ADD_V4_FEAT(projid32, PROJID32)
__XFS_HAS_V4_FEAT(v3inodes, V3INODES)
__XFS_HAS_V4_FEAT(crc, CRC)
__XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
+static inline void xfs_add_attr2(struct xfs_mount *mp)
+{
+ if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4))
+ xfs_sb_version_addattr2(&mp->m_sb);
+}
+
/*
* Mount features
*
@@ -517,7 +519,6 @@ __XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
* bit inodes and read-only state, are kept as operational state rather than
* features.
*/
-__XFS_HAS_FEAT(noattr2, NOATTR2)
__XFS_HAS_FEAT(noalign, NOALIGN)
__XFS_HAS_FEAT(allocsize, ALLOCSIZE)
__XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE)
@@ -526,7 +527,6 @@ __XFS_HAS_FEAT(dirsync, DIRSYNC)
__XFS_HAS_FEAT(discard, DISCARD)
__XFS_HAS_FEAT(grpid, GRPID)
__XFS_HAS_FEAT(small_inums, SMALL_INUMS)
-__XFS_HAS_FEAT(ikeep, IKEEP)
__XFS_HAS_FEAT(swalloc, SWALLOC)
__XFS_HAS_FEAT(filestreams, FILESTREAMS)
__XFS_HAS_FEAT(dax_always, DAX_ALWAYS)
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 866c71d9fbae..73b7e72944e4 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -293,7 +293,8 @@ int
xfs_mru_cache_init(void)
{
xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
- XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+ 1);
if (!xfs_mru_reap_wq)
return -ENOMEM;
return 0;
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index fbeddcac4792..b17672889942 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -165,7 +165,7 @@ xfs_dax_translate_range(
uint64_t *bblen)
{
u64 dev_start = btp->bt_dax_part_off;
- u64 dev_len = bdev_nr_bytes(btp->bt_bdev);
+ u64 dev_len = BBTOB(btp->bt_nr_sectors);
u64 dev_end = dev_start + dev_len - 1;
/* Notify failure on the whole device. */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bb0a82635a77..e85a156dc17d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -105,8 +105,8 @@ enum {
Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
- Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
- Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2,
+ Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32,
+ Opt_largeio, Opt_nolargeio,
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
@@ -133,12 +133,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("norecovery", Opt_norecovery),
fsparam_flag("inode64", Opt_inode64),
fsparam_flag("inode32", Opt_inode32),
- fsparam_flag("ikeep", Opt_ikeep),
- fsparam_flag("noikeep", Opt_noikeep),
fsparam_flag("largeio", Opt_largeio),
fsparam_flag("nolargeio", Opt_nolargeio),
- fsparam_flag("attr2", Opt_attr2),
- fsparam_flag("noattr2", Opt_noattr2),
fsparam_flag("filestreams", Opt_filestreams),
fsparam_flag("quota", Opt_quota),
fsparam_flag("noquota", Opt_noquota),
@@ -175,13 +171,11 @@ xfs_fs_show_options(
{
static struct proc_xfs_info xfs_info_set[] = {
/* the few simple ones we can get from the mount struct */
- { XFS_FEAT_IKEEP, ",ikeep" },
{ XFS_FEAT_WSYNC, ",wsync" },
{ XFS_FEAT_NOALIGN, ",noalign" },
{ XFS_FEAT_SWALLOC, ",swalloc" },
{ XFS_FEAT_NOUUID, ",nouuid" },
{ XFS_FEAT_NORECOVERY, ",norecovery" },
- { XFS_FEAT_ATTR2, ",attr2" },
{ XFS_FEAT_FILESTREAMS, ",filestreams" },
{ XFS_FEAT_GRPID, ",grpid" },
{ XFS_FEAT_DISCARD, ",discard" },
@@ -541,7 +535,8 @@ xfs_setup_devices(
{
int error;
- error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
+ error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize,
+ mp->m_sb.sb_dblocks);
if (error)
return error;
@@ -551,7 +546,7 @@ xfs_setup_devices(
if (xfs_has_sector(mp))
log_sector_size = mp->m_sb.sb_logsectsize;
error = xfs_configure_buftarg(mp->m_logdev_targp,
- log_sector_size);
+ log_sector_size, mp->m_sb.sb_logblocks);
if (error)
return error;
}
@@ -565,7 +560,7 @@ xfs_setup_devices(
mp->m_rtdev_targp = mp->m_ddev_targp;
} else if (mp->m_rtname) {
error = xfs_configure_buftarg(mp->m_rtdev_targp,
- mp->m_sb.sb_sectsize);
+ mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks);
if (error)
return error;
}
@@ -578,19 +573,19 @@ xfs_init_mount_workqueues(
struct xfs_mount *mp)
{
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
1, mp->m_super->s_id);
if (!mp->m_buf_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
0, mp->m_super->s_id);
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
goto out_destroy_unwritten;
@@ -602,13 +597,14 @@ xfs_init_mount_workqueues(
goto out_destroy_reclaim;
mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
1, mp->m_super->s_id);
if (!mp->m_inodegc_wq)
goto out_destroy_blockgc;
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
- XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0,
+ mp->m_super->s_id);
if (!mp->m_sync_workqueue)
goto out_destroy_inodegc;
@@ -778,7 +774,7 @@ xfs_fs_drop_inode(
return 0;
}
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
STATIC void
@@ -1088,15 +1084,6 @@ xfs_finish_flags(
}
/*
- * V5 filesystems always use attr2 format for attributes.
- */
- if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) {
- xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
- "attr2 is always enabled for V5 filesystems.");
- return -EINVAL;
- }
-
- /*
* prohibit r/w mounts of read-only filesystems
*/
if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) {
@@ -1542,22 +1529,6 @@ xfs_fs_parse_param(
return 0;
#endif
/* Following mount options will be removed in September 2025 */
- case Opt_ikeep:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true);
- parsing_mp->m_features |= XFS_FEAT_IKEEP;
- return 0;
- case Opt_noikeep:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false);
- parsing_mp->m_features &= ~XFS_FEAT_IKEEP;
- return 0;
- case Opt_attr2:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true);
- parsing_mp->m_features |= XFS_FEAT_ATTR2;
- return 0;
- case Opt_noattr2:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
- parsing_mp->m_features |= XFS_FEAT_NOATTR2;
- return 0;
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
@@ -1593,16 +1564,6 @@ xfs_fs_validate_params(
return -EINVAL;
}
- /*
- * We have not read the superblock at this point, so only the attr2
- * mount option can set the attr2 feature by this stage.
- */
- if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) {
- xfs_warn(mp, "attr2 and noattr2 cannot both be specified.");
- return -EINVAL;
- }
-
-
if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) {
xfs_warn(mp,
"sunit and swidth options incompatible with the noalign option");
@@ -2177,21 +2138,6 @@ xfs_fs_reconfigure(
if (error)
return error;
- /* attr2 -> noattr2 */
- if (xfs_has_noattr2(new_mp)) {
- if (xfs_has_crc(mp)) {
- xfs_warn(mp,
- "attr2 is always enabled for a V5 filesystem - can't be changed.");
- return -EINVAL;
- }
- mp->m_features &= ~XFS_FEAT_ATTR2;
- mp->m_features |= XFS_FEAT_NOATTR2;
- } else if (xfs_has_attr2(new_mp)) {
- /* noattr2 -> attr2 */
- mp->m_features &= ~XFS_FEAT_NOATTR2;
- mp->m_features |= XFS_FEAT_ATTR2;
- }
-
/* Validate new max_atomic_write option before making other changes */
if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
error = xfs_set_max_atomic_write_opt(mp,
@@ -2596,8 +2542,8 @@ xfs_init_workqueues(void)
* AGs in all the filesystems mounted. Hence use the default large
* max_active value for this workqueue.
*/
- xfs_alloc_wq = alloc_workqueue("xfsalloc",
- XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
+ xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+ 0);
if (!xfs_alloc_wq)
return -ENOMEM;
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 751dc74a3067..9918f14b4874 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -50,7 +50,7 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
-STATIC int
+static inline int
xfs_deprecated_dointvec_minmax(
const struct ctl_table *ctl,
int write,
@@ -68,24 +68,6 @@ xfs_deprecated_dointvec_minmax(
static const struct ctl_table xfs_table[] = {
{
- .procname = "irix_sgid_inherit",
- .data = &xfs_params.sgid_inherit.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = xfs_deprecated_dointvec_minmax,
- .extra1 = &xfs_params.sgid_inherit.min,
- .extra2 = &xfs_params.sgid_inherit.max
- },
- {
- .procname = "irix_symlink_mode",
- .data = &xfs_params.symlink_mode.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = xfs_deprecated_dointvec_minmax,
- .extra1 = &xfs_params.symlink_mode.min,
- .extra2 = &xfs_params.symlink_mode.max
- },
- {
.procname = "panic_mask",
.data = &xfs_params.panic_mask.val,
.maxlen = sizeof(int),
@@ -185,15 +167,6 @@ static const struct ctl_table xfs_table[] = {
.extra1 = &xfs_params.blockgc_timer.min,
.extra2 = &xfs_params.blockgc_timer.max,
},
- {
- .procname = "speculative_cow_prealloc_lifetime",
- .data = &xfs_params.blockgc_timer.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = xfs_deprecated_dointvec_minmax,
- .extra1 = &xfs_params.blockgc_timer.min,
- .extra2 = &xfs_params.blockgc_timer.max,
- },
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
{
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 51646f066c4f..ed9d896079c1 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -19,9 +19,6 @@ typedef struct xfs_sysctl_val {
} xfs_sysctl_val_t;
typedef struct xfs_param {
- xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
- * not a member of parent dir GID. */
- xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */
xfs_sysctl_val_t error_level; /* Degree of reporting for problems */
xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ac344e42846c..79b8641880ab 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1152,7 +1152,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->count = atomic_read(&VFS_I(ip)->i_count);
+ __entry->count = icount_read(VFS_I(ip));
__entry->pincount = atomic_read(&ip->i_pincount);
__entry->iflags = ip->i_flags;
__entry->caller_ip = caller_ip;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 575e7028f423..474f5a04ec63 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -452,19 +452,17 @@ xfs_trans_mod_sb(
*/
STATIC void
xfs_trans_apply_sb_deltas(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
- struct xfs_dsb *sbp;
- struct xfs_buf *bp;
- int whole = 0;
-
- bp = xfs_trans_getsb(tp);
- sbp = bp->b_addr;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *bp = xfs_trans_getsb(tp);
+ struct xfs_dsb *sbp = bp->b_addr;
+ int whole = 0;
/*
* Only update the superblock counters if we are logging them
*/
- if (!xfs_has_lazysbcount((tp->t_mountp))) {
+ if (!xfs_has_lazysbcount(mp)) {
if (tp->t_icount_delta)
be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
if (tp->t_ifree_delta)
@@ -491,8 +489,7 @@ xfs_trans_apply_sb_deltas(
* write the correct value ondisk.
*/
if ((tp->t_frextents_delta || tp->t_res_frextents_delta) &&
- !xfs_has_rtgroups(tp->t_mountp)) {
- struct xfs_mount *mp = tp->t_mountp;
+ !xfs_has_rtgroups(mp)) {
int64_t rtxdelta;
rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
@@ -505,6 +502,8 @@ xfs_trans_apply_sb_deltas(
if (tp->t_dblocks_delta) {
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
+ mp->m_ddev_targp->bt_nr_sectors +=
+ XFS_FSB_TO_BB(mp, tp->t_dblocks_delta);
whole = 1;
}
if (tp->t_agcount_delta) {
@@ -524,7 +523,7 @@ xfs_trans_apply_sb_deltas(
* recompute the ondisk rtgroup block log. The incore values
* will be recomputed in xfs_trans_unreserve_and_mod_sb.
*/
- if (xfs_has_rtgroups(tp->t_mountp)) {
+ if (xfs_has_rtgroups(mp)) {
sbp->sb_rgblklog = xfs_compute_rgblklog(
be32_to_cpu(sbp->sb_rgextents),
be32_to_cpu(sbp->sb_rextsize));
@@ -537,6 +536,8 @@ xfs_trans_apply_sb_deltas(
}
if (tp->t_rblocks_delta) {
be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
+ mp->m_rtdev_targp->bt_nr_sectors +=
+ XFS_FSB_TO_BB(mp, tp->t_rblocks_delta);
whole = 1;
}
if (tp->t_rextents_delta) {
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 67c328d23e4a..38983c6777df 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -374,7 +374,7 @@ xfsaild_push_item(
* If log item pinning is enabled, skip the push and track the item as
* pinned. This can help induce head-behind-tail conditions.
*/
- if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
+ if (XFS_TEST_ERROR(ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
/*
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index f28214c28ab5..1147bacb2da8 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -493,64 +493,58 @@ xfs_try_open_zone(
return oz;
}
+enum xfs_zone_alloc_score {
+ /* Any open zone will do it, we're desperate */
+ XFS_ZONE_ALLOC_ANY = 0,
+
+ /* It better fit somehow */
+ XFS_ZONE_ALLOC_OK = 1,
+
+ /* Only reuse a zone if it fits really well. */
+ XFS_ZONE_ALLOC_GOOD = 2,
+};
+
/*
- * For data with short or medium lifetime, try to colocated it into an
- * already open zone with a matching temperature.
+ * Lifetime hint co-location matrix. Fields not set default to 0
+ * aka XFS_ZONE_ALLOC_ANY.
*/
-static bool
-xfs_colocate_eagerly(
- enum rw_hint file_hint)
-{
- switch (file_hint) {
- case WRITE_LIFE_MEDIUM:
- case WRITE_LIFE_SHORT:
- case WRITE_LIFE_NONE:
- return true;
- default:
- return false;
- }
-}
-
-static bool
-xfs_good_hint_match(
- struct xfs_open_zone *oz,
- enum rw_hint file_hint)
-{
- switch (oz->oz_write_hint) {
- case WRITE_LIFE_LONG:
- case WRITE_LIFE_EXTREME:
- /* colocate long and extreme */
- if (file_hint == WRITE_LIFE_LONG ||
- file_hint == WRITE_LIFE_EXTREME)
- return true;
- break;
- case WRITE_LIFE_MEDIUM:
- /* colocate medium with medium */
- if (file_hint == WRITE_LIFE_MEDIUM)
- return true;
- break;
- case WRITE_LIFE_SHORT:
- case WRITE_LIFE_NONE:
- case WRITE_LIFE_NOT_SET:
- /* colocate short and none */
- if (file_hint <= WRITE_LIFE_SHORT)
- return true;
- break;
- }
- return false;
-}
+static const unsigned int
+xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = {
+ [WRITE_LIFE_NOT_SET] = {
+ [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_NONE] = {
+ [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_SHORT] = {
+ [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD,
+ },
+ [WRITE_LIFE_MEDIUM] = {
+ [WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD,
+ },
+ [WRITE_LIFE_LONG] = {
+ [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
+ [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_EXTREME] = {
+ [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
+ [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
+ },
+};
static bool
xfs_try_use_zone(
struct xfs_zone_info *zi,
enum rw_hint file_hint,
struct xfs_open_zone *oz,
- bool lowspace)
+ unsigned int goodness)
{
if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return false;
- if (!lowspace && !xfs_good_hint_match(oz, file_hint))
+
+ if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness)
return false;
+
if (!atomic_inc_not_zero(&oz->oz_ref))
return false;
@@ -581,14 +575,14 @@ static struct xfs_open_zone *
xfs_select_open_zone_lru(
struct xfs_zone_info *zi,
enum rw_hint file_hint,
- bool lowspace)
+ unsigned int goodness)
{
struct xfs_open_zone *oz;
lockdep_assert_held(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
+ if (xfs_try_use_zone(zi, file_hint, oz, goodness))
return oz;
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -651,9 +645,11 @@ xfs_select_zone_nowait(
* data.
*/
spin_lock(&zi->zi_open_zones_lock);
- if (xfs_colocate_eagerly(write_hint))
- oz = xfs_select_open_zone_lru(zi, write_hint, false);
- else if (pack_tight)
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD);
+ if (oz)
+ goto out_unlock;
+
+ if (pack_tight)
oz = xfs_select_open_zone_mru(zi, write_hint);
if (oz)
goto out_unlock;
@@ -667,16 +663,16 @@ xfs_select_zone_nowait(
goto out_unlock;
/*
- * Try to colocate cold data with other cold data if we failed to open a
- * new zone for it.
+ * Try to find a zone that is an ok match to colocate data with.
+ */
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
+ if (oz)
+ goto out_unlock;
+
+ /*
+ * Pick the least recently used zone, regardless of hint match
*/
- if (write_hint != WRITE_LIFE_NOT_SET &&
- !xfs_colocate_eagerly(write_hint))
- oz = xfs_select_open_zone_lru(zi, write_hint, false);
- if (!oz)
- oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
- if (!oz)
- oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY);
out_unlock:
spin_unlock(&zi->zi_open_zones_lock);
return oz;
@@ -1135,7 +1131,7 @@ xfs_calc_open_zones(
if (bdev_open_zones)
mp->m_max_open_zones = bdev_open_zones;
else
- mp->m_max_open_zones = xfs_max_open_zones(mp);
+ mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES;
}
if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
@@ -1248,7 +1244,7 @@ xfs_mount_zones(
if (!mp->m_zone_info)
return -ENOMEM;
- xfs_info(mp, "%u zones of %u blocks size (%u max open)",
+ xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
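
Reading the xfs_select_zone_nowait() hunks above together, the hint-score table
turns the old special-cased colocation logic into one ordered walk over the
open zones. Roughly, as a sketch of the resulting selection order (the
new-zone-opening step sits in context not shown in this hunk):

	/* 1. strongly prefer an open zone whose write hint pairs GOOD with ours */
	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD);

	/* 2. for tight packing, reuse the most recently used open zone */
	if (!oz && pack_tight)
		oz = xfs_select_open_zone_mru(zi, write_hint);

	/* 3. (elided context) try to open a brand new zone */

	/* 4. settle for an OK hint pairing ... */
	if (!oz)
		oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);

	/* 5. ... and finally take any open zone at all */
	if (!oz)
		oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY);
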