summary | refs | log | tree | commit | diff
path: root/mm/memory-failure.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r-- mm/memory-failure.c 192
1 files changed, 182 insertions, 10 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3edebb0cda30..fbc5a01260c8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -38,6 +38,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/memory-failure.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
@@ -50,7 +51,7 @@
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/slab.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
@@ -60,9 +61,12 @@
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/sysctl.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/memory-failure.h>
+
#include "swap.h"
#include "internal.h"
-#include "ras/ras_event.h"
static int sysctl_memory_failure_early_kill __read_mostly;
@@ -154,6 +158,10 @@ static const struct ctl_table memory_failure_table[] = {
}
};
+static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
+
+static DEFINE_MUTEX(pfn_space_lock);
+
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
@@ -688,10 +696,10 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
if (pte_present(pte)) {
pfn = pte_pfn(pte);
} else {
- swp_entry_t swp = pte_to_swp_entry(pte);
+ const softleaf_t entry = softleaf_from_pte(pte);
- if (is_hwpoison_entry(swp))
- pfn = swp_offset_pfn(swp);
+ if (softleaf_is_hwpoison(entry))
+ pfn = softleaf_to_pfn(entry);
}
if (!pfn || pfn != poisoned_pfn)
@@ -885,6 +893,7 @@ static const char * const action_page_types[] = {
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_ALREADY_POISONED] = "already poisoned page",
+ [MF_MSG_PFN_MAP] = "non struct page pfn",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -1277,7 +1286,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
{
trace_memory_failure_event(pfn, type, result);
- if (type != MF_MSG_ALREADY_POISONED) {
+ if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) {
num_poisoned_pages_inc(pfn);
update_per_node_mf_stats(pfn, result);
}
@@ -1653,12 +1662,13 @@ static int identify_page_state(unsigned long pfn, struct page *p,
* there is still more to do, hence the page refcount we took earlier
* is still needed.
*/
-static int try_to_split_thp_page(struct page *page, bool release)
+static int try_to_split_thp_page(struct page *page, unsigned int new_order,
+ bool release)
{
int ret;
lock_page(page);
- ret = split_huge_page(page);
+ ret = split_huge_page_to_order(page, new_order);
unlock_page(page);
if (ret && release)
@@ -2140,8 +2150,140 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
{
LIST_HEAD(tokill);
+ folio_lock(folio);
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
+ folio_unlock(folio);
+
+ kill_procs(&tokill, true, pfn, flags);
+}
+
+/**
+ * register_pfn_address_space - Register a PFN range for memory-failure handling
+ * @pfn_space: descriptor whose interval node [node.start, node.last] is added
+ *
+ * The node is inserted into pfn_space_itree while holding pfn_space_lock.
+ * A range that overlaps an entry already present in the tree is rejected.
+ *
+ * Return: 0 on success, -EBUSY if the range overlaps an existing entry.
+ */
+int register_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	struct interval_tree_node *range = &pfn_space->node;
+	struct interval_tree_node *clash;
+
+	guard(mutex)(&pfn_space_lock);
+
+	clash = interval_tree_iter_first(&pfn_space_itree, range->start,
+					 range->last);
+	if (!clash) {
+		interval_tree_insert(range, &pfn_space_itree);
+		return 0;
+	}
+
+	return -EBUSY;
+}
+EXPORT_SYMBOL_GPL(register_pfn_address_space);
+
+/**
+ * unregister_pfn_address_space - Remove a previously registered PFN range
+ * @pfn_space: descriptor earlier passed to register_pfn_address_space()
+ *
+ * Takes pfn_space_lock and removes @pfn_space's node from pfn_space_itree.
+ * If the node is not found in the tree (for example because
+ * register_pfn_address_space() returned -EBUSY for it), nothing is removed.
+ */
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	struct interval_tree_node *node;
+
+	guard(mutex)(&pfn_space_lock);
+
+	/*
+	 * An overlap hit alone does not prove that @pfn_space itself is in the
+	 * tree: a descriptor that was rejected at registration time overlaps
+	 * some other entry by definition. Only remove the node when the walk
+	 * actually reaches it; removing a node that was never inserted would
+	 * corrupt the tree.
+	 */
+	for (node = interval_tree_iter_first(&pfn_space_itree,
+					     pfn_space->node.start,
+					     pfn_space->node.last);
+	     node;
+	     node = interval_tree_iter_next(node, pfn_space->node.start,
+					    pfn_space->node.last)) {
+		if (node == &pfn_space->node) {
+			interval_tree_remove(node, &pfn_space_itree);
+			return;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
+
+/*
+ * Queue @tsk on @to_kill for an error that hit @pfn as mapped through @vma.
+ *
+ * On allocation failure the task is only reported and not queued. A failed
+ * address lookup (-EFAULT) is reported too, but the entry is still queued
+ * with that address value.
+ */
+static void add_to_kill_pfn(struct task_struct *tsk,
+			    struct vm_area_struct *vma,
+			    struct list_head *to_kill,
+			    unsigned long pfn)
+{
+	struct to_kill *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+
+	if (entry == NULL) {
+		pr_info("Unable to kill proc %d\n", tsk->pid);
+		return;
+	}
+
+	entry->size_shift = PAGE_SHIFT;
+
+	/* No struct page here: the PFN itself is what gets looked up in @vma. */
+	entry->addr = vma_address(vma, pfn, 1);
+	if (entry->addr == -EFAULT)
+		pr_info("Unable to find address %lx in %s\n",
+			pfn, tsk->comm);
+
+	/* Take a task reference on behalf of the queued entry. */
+	get_task_struct(tsk);
+	entry->tsk = tsk;
+	list_add_tail(&entry->nd, to_kill);
+}
+
+/*
+ * Gather the tasks to signal when the error hit a PFN without a struct page.
+ *
+ * For every process for which task_early_kill() returns a task, each VMA of
+ * @mapping found at offset @pfn that belongs to that task's mm contributes
+ * one entry to @to_kill. The walk runs with the mapping's i_mmap lock held
+ * for read and inside an RCU read-side section.
+ */
+static void collect_procs_pfn(struct address_space *mapping,
+			      unsigned long pfn, struct list_head *to_kill)
+{
+	struct task_struct *proc;
+	struct vm_area_struct *vma;
+
+	i_mmap_lock_read(mapping);
+	rcu_read_lock();
+	for_each_process(proc) {
+		struct task_struct *victim = task_early_kill(proc, true);
+
+		if (!victim)
+			continue;
+
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) {
+			/* Skip mappings owned by some other address space. */
+			if (vma->vm_mm != victim->mm)
+				continue;
+			add_to_kill_pfn(victim, vma, to_kill, pfn);
+		}
+	}
+	rcu_read_unlock();
+	i_mmap_unlock_read(mapping);
+}
+
+/**
+ * memory_failure_pfn - Handle memory failure on a page not backed by
+ *                      struct page.
+ * @pfn: Page Number of the corrupted page
+ * @flags: fine tune action taken
+ *
+ * Return:
+ *   0      - success,
+ *   -EBUSY - Page PFN does not belong to any address space mapping.
+ */
+static int memory_failure_pfn(unsigned long pfn, int flags)
+{
+	LIST_HEAD(tokill);
+
+	scoped_guard(mutex, &pfn_space_lock) {
+		struct interval_tree_node *it;
+		bool covered = false;
+
+		/*
+		 * Modules register with MM the address space mappings of the
+		 * device memory they manage. Walk every registered range that
+		 * contains the failing PFN and collect the processes mapping
+		 * it through that address space.
+		 */
+		it = interval_tree_iter_first(&pfn_space_itree, pfn, pfn);
+		while (it) {
+			struct pfn_address_space *space =
+				container_of(it, struct pfn_address_space, node);
+
+			collect_procs_pfn(space->mapping, pfn, &tokill);
+			covered = true;
+
+			it = interval_tree_iter_next(it, pfn, pfn);
+		}
+
+		/* No registered range contains this PFN: nothing to act on. */
+		if (!covered)
+			return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED);
+	}
+
+	/*
+	 * Unlike System-RAM there is no possibility to swap in a different
+	 * physical page at a given virtual address, so all userspace
+	 * consumption of direct PFN memory necessitates SIGBUS (i.e.
+	 * MF_MUST_KILL).
+	 */
+	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+	kill_procs(&tokill, true, pfn, flags);
+
+	return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED);
+}
/**
@@ -2193,6 +2335,14 @@ int memory_failure(unsigned long pfn, int flags)
if (res == 0)
goto unlock_mutex;
+ if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) {
+ /*
+ * The PFN is not backed by struct page.
+ */
+ res = memory_failure_pfn(pfn, flags);
+ goto unlock_mutex;
+ }
+
if (pfn_valid(pfn)) {
pgmap = get_dev_pagemap(pfn);
put_ref_page(pfn, flags);
@@ -2274,6 +2424,9 @@ try_again:
folio_unlock(folio);
if (folio_test_large(folio)) {
+ const int new_order = min_order_for_split(folio);
+ int err;
+
/*
* The flag must be set after the refcount is bumped
* otherwise it may race with THP split.
@@ -2288,7 +2441,16 @@ try_again:
* page is a valid handlable page.
*/
folio_set_has_hwpoisoned(folio);
- if (try_to_split_thp_page(p, false) < 0) {
+ err = try_to_split_thp_page(p, new_order, /* release= */ false);
+ /*
+ * If splitting a folio to order-0 fails, kill the process.
+ * Split the folio regardless to minimize unusable pages.
+ * Because the memory failure code cannot handle large
+ * folios, this split is always treated as if it failed.
+ */
+ if (err || new_order) {
+ /* get folio again in case the original one is split */
+ folio = page_folio(p);
res = -EHWPOISON;
kill_procs_now(p, pfn, flags, folio);
put_page(p);
@@ -2615,7 +2777,17 @@ static int soft_offline_in_use_page(struct page *page)
};
if (!huge && folio_test_large(folio)) {
- if (try_to_split_thp_page(page, true)) {
+ const int new_order = min_order_for_split(folio);
+
+ /*
+ * If new_order (target split order) is not 0, do not split the
+ * folio at all to retain the still accessible large folio.
+ * NOTE: if minimizing the number of soft offline pages is
+ * preferred, split it to non-zero new_order like it is done in
+ * memory_failure().
+ */
+ if (new_order || try_to_split_thp_page(page, /* new_order= */ 0,
+ /* release= */ true)) {
pr_info("%#lx: thp split failed\n", pfn);
return -EBUSY;
}