From 00501b531c4723972aa11d6d4ebcf8d6552007c8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 8 Aug 2014 14:19:20 -0700 Subject: mm: memcontrol: rewrite charge API These patches rework memcg charge lifetime to integrate more naturally with the lifetime of user pages. This drastically simplifies the code and reduces charging and uncharging overhead. The most expensive part of charging and uncharging is the page_cgroup bit spinlock, which is removed entirely after this series. Here are the top-10 profile entries of a stress test that reads a 128G sparse file on a freshly booted box, without even a dedicated cgroup (i.e. executing in the root memcg). Before: 15.36% cat [kernel.kallsyms] [k] copy_user_generic_string 13.31% cat [kernel.kallsyms] [k] memset 11.48% cat [kernel.kallsyms] [k] do_mpage_readpage 4.23% cat [kernel.kallsyms] [k] get_page_from_freelist 2.38% cat [kernel.kallsyms] [k] put_page 2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge 2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common 1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn After: 15.67% cat [kernel.kallsyms] [k] copy_user_generic_string 13.48% cat [kernel.kallsyms] [k] memset 11.42% cat [kernel.kallsyms] [k] do_mpage_readpage 3.98% cat [kernel.kallsyms] [k] get_page_from_freelist 2.46% cat [kernel.kallsyms] [k] put_page 2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn 1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk 1.30% cat [kernel.kallsyms] [k] kfree As you can see, the memcg footprint has shrunk quite a bit. text data bss dec hex filename 37970 9892 400 48262 bc86 mm/memcontrol.o.old 35239 9892 400 45531 b1db mm/memcontrol.o This patch (of 4): The memcg charge API charges pages before they are rmapped - i.e. have an actual "type" - and so every callsite needs its own set of charge and uncharge functions to know what type is being operated on. Worse, uncharge has to happen from a context that is still type-specific, rather than at the end of the page's lifetime with exclusive access, and so requires a lot of synchronization. Rewrite the charge API to provide a generic set of try_charge(), commit_charge() and cancel_charge() transaction operations, much like what's currently done for swap-in: mem_cgroup_try_charge() attempts to reserve a charge, reclaiming pages from the memcg if necessary. mem_cgroup_commit_charge() commits the page to the charge once it has a valid page->mapping and PageAnon() reliably tells the type. mem_cgroup_cancel_charge() aborts the transaction. This reduces the charge API and enables subsequent patches to drastically simplify uncharging. As pages need to be committed after rmap is established but before they are added to the LRU, page_add_new_anon_rmap() must stop doing LRU additions again. Revive lru_cache_add_active_or_unevictable(). [hughd@google.com: fix shmem_unuse] [hughd@google.com: Add comments on the private use of -EAGAIN] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Hugh Dickins Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e8c137..1d0af8a2c646 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; + struct mem_cgroup *memcg; + + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + if (err) + return err; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); @@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); + mem_cgroup_commit_charge(kpage, memcg, false); + lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { dec_mm_counter(mm, MM_FILEPAGES); @@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@ -315,18 +323,11 @@ retry: if (!new_page) goto put_old; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) - goto put_new; - __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - if (ret) - mem_cgroup_uncharge_page(new_page); - -put_new: page_cache_release(new_page); put_old: put_page(old_page); -- cgit v1.2.3 From 747db954cab64c6b7a95b121b517165f34751898 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 8 Aug 2014 14:19:24 -0700 Subject: mm: memcontrol: use page lists for uncharge batching Pages are now uncharged at release time, and all sources of batched uncharges operate on lists of pages. Directly use those lists, and get rid of the per-task batching state. This also batches statistics accounting, in addition to the res counter charges, to reduce IRQ-disabling and re-enabling. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Cc: Naoya Horiguchi Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 +-- include/linux/sched.h | 6 -- kernel/fork.c | 4 - mm/memcontrol.c | 206 ++++++++++++++++++++++++--------------------- mm/swap.c | 6 +- mm/vmscan.c | 12 ++- 6 files changed, 117 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 806b8fa15c5f..e0752d204d9e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -59,12 +59,8 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare); void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); - void mem_cgroup_uncharge(struct page *page); - -/* Batched uncharging */ -void mem_cgroup_uncharge_start(void); -void mem_cgroup_uncharge_end(void); +void mem_cgroup_uncharge_list(struct list_head *page_list); void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, bool lrucare); @@ -233,11 +229,7 @@ static inline void mem_cgroup_uncharge(struct page *page) { } -static inline void mem_cgroup_uncharge_start(void) -{ -} - -static inline void mem_cgroup_uncharge_end(void) +static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 7c19d552dc3f..4fcf82a4d243 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1628,12 +1628,6 @@ struct task_struct { unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ - struct memcg_batch_info { - int do_batch; /* incremented when batch uncharge started */ - struct mem_cgroup *memcg; /* target memcg of uncharge */ - unsigned long nr_pages; /* uncharged usage */ - unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ - } memcg_batch; unsigned int memcg_kmem_skip_account; struct memcg_oom_info { struct mem_cgroup *memcg; diff --git a/kernel/fork.c b/kernel/fork.c index fbd3497b221f..f6f5086c9e7d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1346,10 +1346,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9106f1b12f56..a6e2be0241af 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3581,53 +3581,6 @@ out: return ret; } -/* - * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. - * In that cases, pages are freed continuously and we can expect pages - * are in the same memcg. All these calls itself limits the number of - * pages freed at once, then uncharge_start/end() is called properly. - * This may be called prural(2) times in a context, - */ - -void mem_cgroup_uncharge_start(void) -{ - unsigned long flags; - - local_irq_save(flags); - current->memcg_batch.do_batch++; - /* We can do nest. */ - if (current->memcg_batch.do_batch == 1) { - current->memcg_batch.memcg = NULL; - current->memcg_batch.nr_pages = 0; - current->memcg_batch.memsw_nr_pages = 0; - } - local_irq_restore(flags); -} - -void mem_cgroup_uncharge_end(void) -{ - struct memcg_batch_info *batch = ¤t->memcg_batch; - unsigned long flags; - - local_irq_save(flags); - VM_BUG_ON(!batch->do_batch); - if (--batch->do_batch) /* If stacked, do nothing */ - goto out; - /* - * This "batch->memcg" is valid without any css_get/put etc... - * bacause we hide charges behind us. - */ - if (batch->nr_pages) - res_counter_uncharge(&batch->memcg->res, - batch->nr_pages * PAGE_SIZE); - if (batch->memsw_nr_pages) - res_counter_uncharge(&batch->memcg->memsw, - batch->memsw_nr_pages * PAGE_SIZE); - memcg_oom_recover(batch->memcg); -out: - local_irq_restore(flags); -} - #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) @@ -6554,6 +6507,98 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) cancel_charge(memcg, nr_pages); } +static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_mem, unsigned long nr_memsw, + unsigned long nr_anon, unsigned long nr_file, + unsigned long nr_huge, struct page *dummy_page) +{ + unsigned long flags; + + if (nr_mem) + res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); + if (nr_memsw) + res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); + + memcg_oom_recover(memcg); + + local_irq_save(flags); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + memcg_check_events(memcg, dummy_page); + local_irq_restore(flags); +} + +static void uncharge_list(struct list_head *page_list) +{ + struct mem_cgroup *memcg = NULL; + unsigned long nr_memsw = 0; + unsigned long nr_anon = 0; + unsigned long nr_file = 0; + unsigned long nr_huge = 0; + unsigned long pgpgout = 0; + unsigned long nr_mem = 0; + struct list_head *next; + struct page *page; + + next = page_list->next; + do { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + page = list_entry(next, struct page, lru); + next = page->lru.next; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + continue; + + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point, we have + * fully exclusive access to the page. + */ + + if (memcg != pc->mem_cgroup) { + if (memcg) { + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + pgpgout = nr_mem = nr_memsw = 0; + nr_anon = nr_file = nr_huge = 0; + } + memcg = pc->mem_cgroup; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + nr_huge += nr_pages; + } + + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + + if (pc->flags & PCG_MEM) + nr_mem += nr_pages; + if (pc->flags & PCG_MEMSW) + nr_memsw += nr_pages; + pc->flags = 0; + + pgpgout++; + } while (next != page_list); + + if (memcg) + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); +} + /** * mem_cgroup_uncharge - uncharge a page * @page: page to uncharge @@ -6563,67 +6608,34 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) */ void mem_cgroup_uncharge(struct page *page) { - struct memcg_batch_info *batch; - unsigned int nr_pages = 1; - struct mem_cgroup *memcg; struct page_cgroup *pc; - unsigned long pc_flags; - unsigned long flags; - - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page), page); if (mem_cgroup_disabled()) return; + /* Don't touch page->lru of any random page, pre-check: */ pc = lookup_page_cgroup(page); - - /* Every final put_page() ends up here */ if (!PageCgroupUsed(pc)) return; - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - /* - * Nobody should be changing or seriously looking at - * pc->mem_cgroup and pc->flags at this point, we have fully - * exclusive access to the page. - */ - memcg = pc->mem_cgroup; - pc_flags = pc->flags; - pc->flags = 0; - - local_irq_save(flags); + INIT_LIST_HEAD(&page->lru); + uncharge_list(&page->lru); +} - if (nr_pages > 1) - goto direct; - if (unlikely(test_thread_flag(TIF_MEMDIE))) - goto direct; - batch = ¤t->memcg_batch; - if (!batch->do_batch) - goto direct; - if (batch->memcg && batch->memcg != memcg) - goto direct; - if (!batch->memcg) - batch->memcg = memcg; - if (pc_flags & PCG_MEM) - batch->nr_pages++; - if (pc_flags & PCG_MEMSW) - batch->memsw_nr_pages++; - goto out; -direct: - if (pc_flags & PCG_MEM) - res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); - if (pc_flags & PCG_MEMSW) - res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); - memcg_oom_recover(memcg); -out: - mem_cgroup_charge_statistics(memcg, page, -nr_pages); - memcg_check_events(memcg, page); +/** + * mem_cgroup_uncharge_list - uncharge a list of page + * @page_list: list of pages to uncharge + * + * Uncharge a list of pages previously charged with + * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). + */ +void mem_cgroup_uncharge_list(struct list_head *page_list) +{ + if (mem_cgroup_disabled()) + return; - local_irq_restore(flags); + if (!list_empty(page_list)) + uncharge_list(page_list); } /** diff --git a/mm/swap.c b/mm/swap.c index 00523fffa5ed..6b2dc3897cd5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -908,8 +908,6 @@ void release_pages(struct page **pages, int nr, bool cold) struct lruvec *lruvec; unsigned long uninitialized_var(flags); - mem_cgroup_uncharge_start(); - for (i = 0; i < nr; i++) { struct page *page = pages[i]; @@ -941,7 +939,6 @@ void release_pages(struct page **pages, int nr, bool cold) __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); } - mem_cgroup_uncharge(page); /* Clear Active bit in case of parallel mark_page_accessed */ __ClearPageActive(page); @@ -951,8 +948,7 @@ void release_pages(struct page **pages, int nr, bool cold) if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); - mem_cgroup_uncharge_end(); - + mem_cgroup_uncharge_list(&pages_to_free); free_hot_cold_page_list(&pages_to_free, cold); } EXPORT_SYMBOL(release_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7068e838d22b..2836b5373b2e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -822,7 +822,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, cond_resched(); - mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; @@ -1103,7 +1102,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ __clear_page_locked(page); free_it: - mem_cgroup_uncharge(page); nr_reclaimed++; /* @@ -1133,8 +1131,8 @@ keep: list_add(&page->lru, &ret_pages); VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - mem_cgroup_uncharge_end(); + mem_cgroup_uncharge_list(&free_pages); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); @@ -1437,10 +1435,9 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) __ClearPageActive(page); del_page_from_lru_list(page, lruvec, lru); - mem_cgroup_uncharge(page); - if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1548,6 +1545,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge_list(&page_list); free_hot_cold_page_list(&page_list, true); /* @@ -1660,10 +1658,9 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, __ClearPageActive(page); del_page_from_lru_list(page, lruvec, lru); - mem_cgroup_uncharge(page); - if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1771,6 +1768,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); } -- cgit v1.2.3 From 9a3f4d85d58cb4e02e226f9be946d54c33eb715b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:19:28 -0700 Subject: page-cgroup: get rid of NR_PCG_FLAGS It's not used anywhere today, so let's remove it. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 6 ------ kernel/bounds.c | 2 -- 2 files changed, 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index b8f8c9e36a3e..9d9f540658f5 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -6,12 +6,8 @@ enum { PCG_USED = 0x01, /* This page is charged to a memcg */ PCG_MEM = 0x02, /* This page holds a memory charge */ PCG_MEMSW = 0x04, /* This page holds a memory+swap charge */ - __NR_PCG_FLAGS, }; -#ifndef __GENERATING_BOUNDS_H -#include - struct pglist_data; #ifdef CONFIG_MEMCG @@ -107,6 +103,4 @@ static inline void swap_cgroup_swapoff(int type) #endif /* CONFIG_MEMCG_SWAP */ -#endif /* !__GENERATING_BOUNDS_H */ - #endif /* __LINUX_PAGE_CGROUP_H */ diff --git a/kernel/bounds.c b/kernel/bounds.c index 9fd4246b04b8..e1d1d1952bfa 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -18,7 +17,6 @@ void foo(void) /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); - DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif -- cgit v1.2.3 From b86280aa48b67c8119ed8f6c6bebd8c0af13a269 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 8 Aug 2014 14:19:41 -0700 Subject: kernel/kallsyms.c: fix %pB when there's no symbol at the address __sprint_symbol() should restore original address when kallsyms_lookup() failed to find a symbol. It's reported when dumpstack shows an address in a dynamically allocated trampoline for ftrace. [ 1314.612287] [] dump_stack+0x45/0x56 [ 1314.612290] [] ? meminfo_proc_open+0x30/0x30 [ 1314.612293] [] kpatch_ftrace_handler+0x14/0xf0 [kpatch] [ 1314.612306] [] 0xffffffffa00160c3 You can see a difference in the hex address - c4 and c3. Fix it. Signed-off-by: Namhyung Kim Reported-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37dac3a..ae5167087845 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address, address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) - return sprintf(buffer, "0x%lx", address); + return sprintf(buffer, "0x%lx", address - symbol_offset); if (name != buffer) strcpy(buffer, name); -- cgit v1.2.3 From 4878b14b43188ffeceecfc32295ed2a783b7aa7a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 8 Aug 2014 14:19:48 -0700 Subject: kernel/test_kprobes.c: use current logging functions - Add pr_fmt - Coalesce formats - Use current pr_foo() functions instead of printk - Remove unnecessary "failed" display (already in log level). Signed-off-by: Fabian Frederick Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/test_kprobes.c | 87 ++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbfdd83..0dbab6d1acb4 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -14,6 +14,8 @@ * the GNU General Public License for more details. */ +#define pr_fmt(fmt) "Kprobe smoke test: " fmt + #include #include #include @@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); + pr_err("incorrect value in post_handler\n"); } posth_val = preh_val + div_factor; } @@ -59,8 +60,7 @@ static int test_kprobe(void) ret = register_kprobe(&kp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); + pr_err("register_kprobe returned %d\n", ret); return ret; } @@ -68,14 +68,12 @@ static int test_kprobe(void) unregister_kprobe(&kp); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler2\n"); + pr_err("incorrect value in post_handler2\n"); } posth_val = preh_val + div_factor; } @@ -120,8 +117,7 @@ static int test_kprobes(void) kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobes returned %d\n", ret); + pr_err("register_kprobes returned %d\n", ret); return ret; } @@ -130,14 +126,12 @@ static int test_kprobes(void) ret = target(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -146,14 +140,12 @@ static int test_kprobes(void) ret = target2(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler2 not called\n"); + pr_err("kprobe pre_handler2 not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler2 not called\n"); + pr_err("kprobe post_handler2 not called\n"); handler_errors++; } @@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value) { if (value != rand1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); + pr_err("incorrect value in jprobe handler\n"); } jph_val = rand1; @@ -186,16 +177,14 @@ static int test_jprobe(void) ret = register_jprobe(&jp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); + pr_err("register_jprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } @@ -217,24 +206,21 @@ static int test_jprobes(void) jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobes returned %d\n", ret); + pr_err("register_jprobes returned %d\n", ret); return ret; } jph_val = 0; ret = target(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } jph_val = 0; ret = target2(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler2 not called\n"); + pr_err("jprobe handler2 not called\n"); handler_errors++; } unregister_jprobes(jps, 2); @@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); + pr_err("incorrect value in kretprobe handler\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -281,16 +265,14 @@ static int test_kretprobe(void) ret = register_kretprobe(&rp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } @@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler2\n"); + pr_err("incorrect value in kretprobe handler2\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -332,24 +312,21 @@ static int test_kretprobes(void) rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } krph_val = 0; ret = target(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } krph_val = 0; ret = target2(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler2 not called\n"); + pr_err("kretprobe handler2 not called\n"); handler_errors++; } unregister_kretprobes(rps, 2); @@ -368,7 +345,7 @@ int init_test_probes(void) rand1 = prandom_u32(); } while (rand1 <= div_factor); - printk(KERN_INFO "Kprobe smoke test started\n"); + pr_info("started\n"); num_tests++; ret = test_kprobe(); if (ret < 0) @@ -402,13 +379,11 @@ int init_test_probes(void) #endif /* CONFIG_KRETPROBES */ if (errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " - "%d tests failed\n", errors, num_tests); + pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); + pr_err("BUG: %d error(s) running handlers\n", handler_errors); else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); + pr_info("passed successfully\n"); return 0; } -- cgit v1.2.3 From a0be55dee71d437f7593c8c3673edd92962bafaf Mon Sep 17 00:00:00 2001 From: Ionut Alexa Date: Fri, 8 Aug 2014 14:21:18 -0700 Subject: kernel/exit.c: fix coding style warnings and errors Fixed coding style warnings and errors. Signed-off-by: Ionut Alexa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 88c6b3e42583..32c58f7433a3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,7 +59,7 @@ #include #include -static void exit_mm(struct task_struct * tsk); +static void exit_mm(struct task_struct *tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { @@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk) spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); @@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) } -void release_task(struct task_struct * p) +void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; @@ -192,7 +192,8 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + if (leader != p && thread_group_empty(leader) + && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, + struct task_struct *ignored_task) { struct task_struct *p; @@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) struct task_struct *ignored_task = tsk; if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than @@ -405,7 +407,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk) core_state = mm->core_state; if (core_state) { struct core_thread self; + up_read(&mm->mmap_sem); self.task = tsk; @@ -566,6 +569,7 @@ static void forget_original_parent(struct task_struct *father) list_for_each_entry_safe(p, n, &father->children, sibling) { struct task_struct *t = p; + do { t->real_parent = reaper; if (t->parent == father) { @@ -599,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* * This does two things: * - * A. Make init inherit all the child processes + * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) @@ -649,9 +653,8 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s (%d) used greatest stack depth: " - "%lu bytes left\n", - current->comm, task_pid_nr(current), free); + pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -692,8 +695,7 @@ void do_exit(long code) * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); + pr_alert("Fixing recursive fault but reboot is needed!\n"); /* * We can do this unlocked here. The futex code uses * this flag just to verify whether the pi state @@ -717,9 +719,9 @@ void do_exit(long code) raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ @@ -837,7 +839,6 @@ void do_exit(long code) for (;;) cpu_relax(); /* For when BUG is null */ } - EXPORT_SYMBOL_GPL(do_exit); void complete_and_exit(struct completion *comp, long code) @@ -847,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code) do_exit(code); } - EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) @@ -870,6 +870,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ @@ -1034,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_cputime_adjusted() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. + * We use thread_group_cputime_adjusted() to get times for + * the thread group, which consolidates times for all threads + * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); @@ -1418,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); + if (ret) return ret; } @@ -1431,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); + if (ret) return ret; } -- cgit v1.2.3 From ccf94f1b4a8560ffdc221840535bae5e5a91a53c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 8 Aug 2014 14:21:22 -0700 Subject: proc: constify seq_operations proc_uid_seq_operations, proc_gid_seq_operations and proc_projid_seq_operations are only called in proc_id_map_open with seq_open as const struct seq_operations so we can constify the 3 structures and update proc_id_map_open prototype. text data bss dec hex filename 6817 404 1984 9205 23f5 kernel/user_namespace.o-before 6913 308 1984 9205 23f5 kernel/user_namespace.o-after Signed-off-by: Fabian Frederick Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- include/linux/user_namespace.h | 6 +++--- kernel/user_namespace.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 2d696b0c93bf..79df9ff71afd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2449,7 +2449,7 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) #ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, - struct seq_operations *seq_ops) + const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4836ba3c1cd8..e95372654f09 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -57,9 +57,9 @@ static inline void put_user_ns(struct user_namespace *ns) } struct seq_operations; -extern struct seq_operations proc_uid_seq_operations; -extern struct seq_operations proc_gid_seq_operations; -extern struct seq_operations proc_projid_seq_operations; +extern const struct seq_operations proc_uid_seq_operations; +extern const struct seq_operations proc_gid_seq_operations; +extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc02560fd6b..aa312b0dc3ec 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v) return; } -struct seq_operations proc_uid_seq_operations = { +const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; -struct seq_operations proc_gid_seq_operations = { +const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; -struct seq_operations proc_projid_seq_operations = { +const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, -- cgit v1.2.3 From 41f727fde1fe40efeb4fef6fdce74ff794be5aeb Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:21:56 -0700 Subject: fork/exec: cleanup mm initialization mm initialization on fork/exec is spread all over the place, which makes the code look inconsistent. We have mm_init(), which is supposed to init/nullify mm's internals, but it doesn't init all the fields it should: - on fork ->mmap,mm_rb,vmacache_seqnum,map_count,mm_cpumask,locked_vm are zeroed in dup_mmap(); - on fork ->pmd_huge_pte is zeroed in dup_mm(), immediately before calling mm_init(); - ->cpu_vm_mask_var ptr is initialized by mm_init_cpumask(), which is called before mm_init() on both fork and exec; - ->context is initialized by init_new_context(), which is called after mm_init() on both fork and exec; Let's consolidate all the initializations in mm_init() to make the code look cleaner. Signed-off-by: Vladimir Davydov Cc: Oleg Nesterov Cc: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 ---- include/linux/mm_types.h | 1 + kernel/fork.c | 47 ++++++++++++++++++++--------------------------- 3 files changed, 21 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index ab1f1200ce5d..a2b42a98c743 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -368,10 +368,6 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; - err = init_new_context(current, mm); - if (err) - goto err; - err = __bprm_mm_init(bprm); if (err) goto err; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 796deac19fcf..6e0b286649f1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -461,6 +461,7 @@ static inline void mm_init_cpumask(struct mm_struct *mm) #ifdef CONFIG_CPUMASK_OFFSTACK mm->cpu_vm_mask_var = &mm->cpumask_allocation; #endif + cpumask_clear(mm->cpu_vm_mask_var); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ diff --git a/kernel/fork.c b/kernel/fork.c index f6f5086c9e7d..418b52a9ec6a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -374,12 +374,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@ -538,17 +532,27 @@ static void mm_init_aio(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; +#endif if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -558,11 +562,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->def_flags = 0; } - if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + return mm; + +fail_nocontext: + mm_free_pgd(mm); +fail_nopgd: free_mm(mm); return NULL; } @@ -596,7 +606,6 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); } @@ -828,17 +837,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - mm->pmd_huge_pte = NULL; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; - if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm); err = dup_mmap(mm, oldmm); @@ -860,15 +862,6 @@ free_pt: fail_nomem: return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) -- cgit v1.2.3 From ce65cefa5debefc0e81d0a533bda467f0aa67350 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:21:58 -0700 Subject: fork: reset mm->pinned_vm mm->pinned_vm counts pages of mm's address space that were permanently pinned in memory by increasing their reference counter. The counter was introduced by commit bc3e53f682d9 ("mm: distinguish between mlocked and pinned pages"), while before it locked_vm had been used for such pages. Obviously, we should reset the counter on fork if !CLONE_VM, just like we do with locked_vm, but currently we don't. Let's fix it. This patch will fix the contents of /proc/pid/status:VmPin. ib_umem_get[infiniband] and perf_mmap still check pinned_vm against RLIMIT_MEMLOCK. It's left from the times when pinned pages were accounted under locked_vm, but today it looks wrong. It isn't clear how we should deal with it. We still have some drivers accounting pinned pages under mm->locked_vm - this is what commit bc3e53f682d9 was fighting against. It's infiniband/usnic and vfio. Signed-off-by: Vladimir Davydov Cc: Oleg Nesterov Cc: David Rientjes Cc: Christoph Lameter Cc: Roland Dreier Cc: Sean Hefty Cc: Hal Rosenstock Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 418b52a9ec6a..5a547a59a38a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -543,6 +543,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) atomic_long_set(&mm->nr_ptes, 0); mm->map_count = 0; mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_cpumask(mm); -- cgit v1.2.3 From 4f7d461433bb4a4deee61baefdac6cd1a1ecb546 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:22:01 -0700 Subject: fork: copy mm's vm usage counters under mmap_sem If a forking process has a thread calling (un)mmap (silly but still), the child process may have some of its mm's vm usage counters (total_vm and friends) screwed up, because currently they are copied from oldmm w/o holding any locks (memcpy in dup_mm). This patch moves the counters initialization to dup_mmap() to be called under oldmm->mmap_sem, which eliminates any possibility of race. Signed-off-by: Vladimir Davydov Cc: Oleg Nesterov Cc: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5a547a59a38a..aff84f84b0d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -374,6 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; -- cgit v1.2.3 From 33144e8429bd7fceacbb869a7f5061db42e13fe6 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:22:03 -0700 Subject: kernel/fork.c: make mm_init_owner static It's only used in fork.c:mm_init(). Signed-off-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 ----- kernel/fork.c | 14 +++++++------- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4fcf82a4d243..b21e9218c0fd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2961,15 +2961,10 @@ static inline void inc_syscw(struct task_struct *tsk) #ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); -extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } - -static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ -} #endif /* CONFIG_MEMCG */ static inline unsigned long task_rlimit(const struct task_struct *tsk, diff --git a/kernel/fork.c b/kernel/fork.c index aff84f84b0d3..86da59e165ad 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -535,6 +535,13 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + mm->owner = p; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { mm->mmap = NULL; @@ -1139,13 +1146,6 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MEMCG -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. */ -- cgit v1.2.3 From 834b18b23e1012e6c2987af703490bc60956d211 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 8 Aug 2014 14:22:20 -0700 Subject: kernel/gcov/fs.c: remove unnecessary null test before debugfs_remove This fixes checkpatch warning: WARNING: debugfs_remove(NULL) is safe this check is probably not required Signed-off-by: Fabian Frederick Cc: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/fs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a76379..edf67c493a8e 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -784,8 +784,7 @@ static __init int gcov_fs_init(void) err_remove: pr_err("init failed\n"); - if (root_node.dentry) - debugfs_remove(root_node.dentry); + debugfs_remove(root_node.dentry); return rc; } -- cgit v1.2.3 From 69361eef9056b0babb507798c2135ad1572f0ef7 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Fri, 8 Aug 2014 14:22:31 -0700 Subject: panic: add TAINT_SOFTLOCKUP This taint flag will be set if the system has ever entered a softlockup state. Similar to TAINT_WARN it is useful to know whether or not the system has been in a softlockup state when debugging. [akpm@linux-foundation.org: apply the taint before calling panic()] Signed-off-by: Josh Hunt Cc: Jason Baron Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/oops-tracing.txt | 2 ++ Documentation/sysctl/kernel.txt | 1 + include/linux/kernel.h | 1 + kernel/panic.c | 1 + kernel/watchdog.c | 1 + 5 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index e3155995ddd8..beefb9f82902 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -268,6 +268,8 @@ characters, each representing a particular tainted value. 14: 'E' if an unsigned module has been loaded in a kernel supporting module signature. + 15: 'L' if a soft lockup has previously occurred on the system. + The primary reason for the 'Tainted: ' string is to tell kernel debuggers if this is a clean kernel or if anything unusual has occurred. Tainting is permanent: even if an offending module is diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index c14374e71775..f79eb9666379 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -826,6 +826,7 @@ can be ORed together: 4096 - An out-of-tree module has been loaded. 8192 - An unsigned module has been loaded in a kernel supporting module signature. +16384 - A soft lockup has previously occurred on the system. ============================================================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3dc22abbc68a..31ae66f34235 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -470,6 +470,7 @@ extern enum system_states { #define TAINT_FIRMWARE_WORKAROUND 11 #define TAINT_OOT_MODULE 12 #define TAINT_UNSIGNED_MODULE 13 +#define TAINT_SOFTLOCKUP 14 extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] diff --git a/kernel/panic.c b/kernel/panic.c index 62e16cef9cc2..d09dc5c32c67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -224,6 +224,7 @@ static const struct tnt tnts[] = { { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, + { TAINT_SOFTLOCKUP, 'L', ' ' }, }; /** diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 51b29e9d2ba6..a8d6914030fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -368,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) smp_mb__after_atomic(); } + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); -- cgit v1.2.3 From ab602f799159393143d567e5c04b936fec79d6bd Mon Sep 17 00:00:00 2001 From: Jack Miller Date: Fri, 8 Aug 2014 14:23:19 -0700 Subject: shm: make exit_shm work proportional to task activity This is small set of patches our team has had kicking around for a few versions internally that fixes tasks getting hung on shm_exit when there are many threads hammering it at once. Anton wrote a simple test to cause the issue: http://ozlabs.org/~anton/junkcode/bust_shm_exit.c Before applying this patchset, this test code will cause either hanging tracebacks or pthread out of memory errors. After this patchset, it will still produce output like: root@somehost:~# ./bust_shm_exit 1024 160 ... INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 116, t=2111 jiffies, g=241, c=240, q=7113) INFO: Stall ended before state dump start ... But the task will continue to run along happily, so we consider this an improvement over hanging, even if it's a bit noisy. This patch (of 3): exit_shm obtains the ipc_ns shm rwsem for write and holds it while it walks every shared memory segment in the namespace. Thus the amount of work is related to the number of shm segments in the namespace not the number of segments that might need to be cleaned. In addition, this occurs after the task has been notified the thread has exited, so the number of tasks waiting for the ns shm rwsem can grow without bound until memory is exausted. Add a list to the task struct of all shmids allocated by this task. Init the list head in copy_process. Use the ns->rwsem for locking. Add segments after id is added, remove before removing from id. On unshare of NEW_IPCNS orphan any ids as if the task had exited, similar to handling of semaphore undo. I chose a define for the init sequence since its a simple list init, otherwise it would require a function call to avoid include loops between the semaphore code and the task struct. Converting the list_del to list_del_init for the unshare cases would remove the exit followed by init, but I left it blow up if not inited. Signed-off-by: Milton Miller Signed-off-by: Jack Miller Cc: Davidlohr Bueso Cc: Manfred Spraul Cc: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 ++ include/linux/shm.h | 16 +++++++++++++++- ipc/shm.c | 22 +++++++++++----------- kernel/fork.c | 6 ++++++ 4 files changed, 34 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b21e9218c0fd..db2f6474e95e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -33,6 +33,7 @@ struct sched_param { #include #include +#include #include #include #include @@ -1385,6 +1386,7 @@ struct task_struct { #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; + struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ diff --git a/include/linux/shm.h b/include/linux/shm.h index 57d77709fbe2..fd206387048a 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -1,6 +1,7 @@ #ifndef _LINUX_SHM_H_ #define _LINUX_SHM_H_ +#include #include #include #include @@ -20,6 +21,7 @@ struct shmid_kernel /* private to the kernel */ /* The task created the shm object. NULL if the task is dead. */ struct task_struct *shm_creator; + struct list_head shm_clist; /* list by creator */ }; /* shm_mode upper byte flags */ @@ -44,11 +46,20 @@ struct shmid_kernel /* private to the kernel */ #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) #ifdef CONFIG_SYSVIPC +struct sysv_shm { + struct list_head shm_clist; +}; + long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba); extern int is_file_shm_hugepages(struct file *file); -extern void exit_shm(struct task_struct *task); +void exit_shm(struct task_struct *task); +#define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist) #else +struct sysv_shm { + /* empty */ +}; + static inline long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba) @@ -62,6 +73,9 @@ static inline int is_file_shm_hugepages(struct file *file) static inline void exit_shm(struct task_struct *task) { } +static inline void shm_init_task(struct task_struct *task) +{ +} #endif #endif /* _LINUX_SHM_H_ */ diff --git a/ipc/shm.c b/ipc/shm.c index 89fc354156cb..1fc3a61b443b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -178,6 +178,7 @@ static void shm_rcu_free(struct rcu_head *head) static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) { + list_del(&s->shm_clist); ipc_rmid(&shm_ids(ns), &s->shm_perm); } @@ -268,14 +269,10 @@ static void shm_close(struct vm_area_struct *vma) } /* Called with ns->shm_ids(ns).rwsem locked */ -static int shm_try_destroy_current(int id, void *p, void *data) +static void shm_mark_orphan(struct shmid_kernel *shp, struct ipc_namespace *ns) { - struct ipc_namespace *ns = data; - struct kern_ipc_perm *ipcp = p; - struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); - - if (shp->shm_creator != current) - return 0; + if (WARN_ON(shp->shm_creator != current)) /* Remove me when it works */ + return; /* * Mark it as orphaned to destroy the segment when @@ -289,13 +286,12 @@ static int shm_try_destroy_current(int id, void *p, void *data) * is not set, it shouldn't be deleted here. */ if (!ns->shm_rmid_forced) - return 0; + return; if (shm_may_destroy(ns, shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } - return 0; } /* Called with ns->shm_ids(ns).rwsem locked */ @@ -333,14 +329,17 @@ void shm_destroy_orphaned(struct ipc_namespace *ns) void exit_shm(struct task_struct *task) { struct ipc_namespace *ns = task->nsproxy->ipc_ns; + struct shmid_kernel *shp, *n; if (shm_ids(ns).in_use == 0) return; /* Destroy all already created segments, but not mapped yet */ down_write(&shm_ids(ns).rwsem); - if (shm_ids(ns).in_use) - idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); + list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) + shm_mark_orphan(shp, ns); + /* remove the list head from any segments still attached */ + list_del(&task->sysvshm.shm_clist); up_write(&shm_ids(ns).rwsem); } @@ -561,6 +560,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_nattch = 0; shp->shm_file = file; shp->shm_creator = current; + list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); /* * shmid gets reported as "inode#" in /proc/pid/maps. diff --git a/kernel/fork.c b/kernel/fork.c index 86da59e165ad..fa9124322cd4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1362,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@ -1913,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); -- cgit v1.2.3 From 934fc295b30ea8ce5d5e0ab9024a10fab9b6f200 Mon Sep 17 00:00:00 2001 From: Ionut Alexa Date: Fri, 8 Aug 2014 14:23:42 -0700 Subject: kernel/acct.c: fix coding style warnings and errors Signed-off-by: Ionut Alexa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a1844f14c6d6..51793520566f 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -141,12 +141,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) if (acct->active) { if (act < 0) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { if (act > 0) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } @@ -261,6 +261,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); error = acct_on(tmp); @@ -376,7 +377,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -389,7 +390,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 < 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -486,16 +488,17 @@ static void do_acct_process(struct bsd_acct_struct *acct, run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_etime = encode_float(elapsed); #else ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); + ac.ac_etime_hi = etime >> 16; ac.ac_etime_lo = (u16) etime; } @@ -505,15 +508,15 @@ static void do_acct_process(struct bsd_acct_struct *acct, /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 +#if ACCT_VERSION == 2 ac.ac_ahz = AHZ; #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@ -574,6 +577,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; while (vma) { -- cgit v1.2.3 From 4bb5f5d9395bc112d93a134d8f5b05611eddc9c0 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:25 -0700 Subject: mm: allow drivers to prevent new writable mappings This patch (of 6): The i_mmap_writable field counts existing writable mappings of an address_space. To allow drivers to prevent new writable mappings, make this counter signed and prevent new writable mappings if it is negative. This is modelled after i_writecount and DENYWRITE. This will be required by the shmem-sealing infrastructure to prevent any new writable mappings after the WRITE seal has been set. In case there exists a writable mapping, this operation will fail with EBUSY. Note that we rely on the fact that iff you already own a writable mapping, you can increase the counter without using the helpers. This is the same that we do for i_writecount. Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 1 + include/linux/fs.h | 29 +++++++++++++++++++++++++++-- kernel/fork.c | 2 +- mm/mmap.c | 30 ++++++++++++++++++++++++------ mm/swap_state.c | 1 + 5 files changed, 54 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/inode.c b/fs/inode.c index 5938f3928944..26753ba7b6d6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -165,6 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->backing_dev_info = &default_backing_dev_info; diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ab6c6913040..f0890e4a7c25 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -387,7 +387,7 @@ struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ - unsigned int i_mmap_writable;/* count VM_SHARED mappings */ + atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct mutex i_mmap_mutex; /* protect tree, count, list */ @@ -470,10 +470,35 @@ static inline int mapping_mapped(struct address_space *mapping) * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. + * + * If i_mmap_writable is negative, no new writable mappings are allowed. You + * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(struct address_space *mapping) { - return mapping->i_mmap_writable != 0; + return atomic_read(&mapping->i_mmap_writable) > 0; +} + +static inline int mapping_map_writable(struct address_space *mapping) +{ + return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? + 0 : -EPERM; +} + +static inline void mapping_unmap_writable(struct address_space *mapping) +{ + atomic_dec(&mapping->i_mmap_writable); +} + +static inline int mapping_deny_writable(struct address_space *mapping) +{ + return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? + 0 : -EBUSY; +} + +static inline void mapping_allow_writable(struct address_space *mapping) +{ + atomic_inc(&mapping->i_mmap_writable); } /* diff --git a/kernel/fork.c b/kernel/fork.c index fa9124322cd4..1380d8ace334 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -429,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ if (unlikely(tmp->vm_flags & VM_NONLINEAR)) diff --git a/mm/mmap.c b/mm/mmap.c index 64c9d736155c..c1f2ea4a0b99 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -221,7 +221,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable--; + mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -622,7 +622,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -1577,6 +1577,17 @@ munmap_back: if (error) goto free_vma; } + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto allow_write_and_free_vma; + } + + /* ->mmap() can change vma->vm_file, but must guarantee that + * vma_link() below can deny write-access if VM_DENYWRITE is set + * and map writably if VM_SHARED is set. This usually means the + * new file must not have been exposed to user-space, yet. + */ vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); if (error) @@ -1616,8 +1627,12 @@ munmap_back: vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); + if (file) { + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + } file = vma->vm_file; out: perf_event_mmap(vma); @@ -1646,14 +1661,17 @@ out: return addr; unmap_and_free_vma: - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); vma->vm_file = NULL; fput(file); /* Undo any partial mapping done by a device driver. */ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); charged = 0; + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); +allow_write_and_free_vma: + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: diff --git a/mm/swap_state.c b/mm/swap_state.c index e160151da6b8..3e0ec83d000c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -39,6 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_spaces[MAX_SWAPFILES] = { [0 ... MAX_SWAPFILES - 1] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, } -- cgit v1.2.3 From 9183df25fe7b194563db3fec6dc3202a5855839c Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:29 -0700 Subject: shm: add memfd_create() syscall memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor that you can pass to mmap(). It can support sealing and avoids any connection to user-visible mount-points. Thus, it's not subject to quotas on mounted file-systems, but can be used like malloc()'ed memory, but with a file-descriptor to it. memfd_create() returns the raw shmem file, so calls like ftruncate() can be used to modify the underlying inode. Also calls like fstat() will return proper information and mark the file as regular file. If you want sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not supported (like on all other regular files). Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not subject to a filesystem size limit. It is still properly accounted to memcg limits, though, and to the same overcommit or no-overcommit accounting as all user memory. Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 1 + include/uapi/linux/memfd.h | 8 +++++ kernel/sys_ni.c | 1 + mm/shmem.c | 73 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+) create mode 100644 include/uapi/linux/memfd.h (limited to 'kernel') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index d1b4a119d4a5..028b78168d85 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -362,3 +362,4 @@ 353 i386 renameat2 sys_renameat2 354 i386 seccomp sys_seccomp 355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 252c804bb1aa..ca2b9aa78c81 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -325,6 +325,7 @@ 316 common renameat2 sys_renameat2 317 common seccomp sys_seccomp 318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 701daff5d899..15a069425cbf 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -802,6 +802,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags, asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); +asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h new file mode 100644 index 000000000000..534e364bda92 --- /dev/null +++ b/include/uapi/linux/memfd.h @@ -0,0 +1,8 @@ +#ifndef _UAPI_LINUX_MEMFD_H +#define _UAPI_LINUX_MEMFD_H + +/* flags for memfd_create(2) (unsigned int) */ +#define MFD_CLOEXEC 0x0001U +#define MFD_ALLOW_SEALING 0x0002U + +#endif /* _UAPI_LINUX_MEMFD_H */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2904a2105914..1f79e3714533 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -197,6 +197,7 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); +cond_syscall(sys_memfd_create); /* performance counters: */ cond_syscall(sys_perf_event_open); diff --git a/mm/shmem.c b/mm/shmem.c index 8b43bb7a4efe..4a5498795a2b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -66,7 +66,9 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include +#include #include #include @@ -2732,6 +2734,77 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) shmem_show_mpol(seq, sbinfo->mpol); return 0; } + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + struct shmem_inode_info *info; + struct file *file; + int fd, error; + char *name; + long len; + + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + info = SHMEM_I(file_inode(file)); + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + if (flags & MFD_ALLOW_SEALING) + info->seals &= ~F_SEAL_SEAL; + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} + #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) -- cgit v1.2.3 From 8370edea81e321b8a976969753d6b2811e6d5ed6 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:38 -0700 Subject: bin2c: move bin2c in scripts/basic This patch series does not do kernel signature verification yet. I plan to post another patch series for that. Now distributions are already signing PE/COFF bzImage with PKCS7 signature I plan to parse and verify those signatures. Primary goal of this patchset is to prepare groundwork so that kernel image can be signed and signatures be verified during kexec load. This should help with two things. - It should allow kexec/kdump on secureboot enabled machines. - In general it can help even without secureboot. By being able to verify kernel image signature in kexec, it should help with avoiding module signing restrictions. Matthew Garret showed how to boot into a custom kernel, modify first kernel's memory and then jump back to old kernel and bypass any policy one wants to. This patch (of 15): Kexec wants to use bin2c and it wants to use it really early in the build process. See arch/x86/purgatory/ code in later patches. So move bin2c in scripts/basic so that it can be built very early and be usable by arch/x86/purgatory/ Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- scripts/.gitignore | 1 - scripts/Makefile | 1 - scripts/basic/.gitignore | 1 + scripts/basic/Makefile | 1 + scripts/basic/bin2c.c | 35 +++++++++++++++++++++++++++++++++++ scripts/bin2c.c | 36 ------------------------------------ 7 files changed, 38 insertions(+), 39 deletions(-) create mode 100644 scripts/basic/bin2c.c delete mode 100644 scripts/bin2c.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0026cf531769..dc5c77544fd6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -105,7 +105,7 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) diff --git a/scripts/.gitignore b/scripts/.gitignore index fb070fa1038f..5ecfe93f2028 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -4,7 +4,6 @@ conmakehash kallsyms pnmtologo -bin2c unifdef ihex2fw recordmcount diff --git a/scripts/Makefile b/scripts/Makefile index 890df5c6adfb..72902b5f2721 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -13,7 +13,6 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash -hostprogs-$(CONFIG_IKCONFIG) += bin2c hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount hostprogs-$(CONFIG_BUILDTIME_EXTABLE_SORT) += sortextable hostprogs-$(CONFIG_ASN1) += asn1_compiler diff --git a/scripts/basic/.gitignore b/scripts/basic/.gitignore index a776371a3502..9528ec9e5adc 100644 --- a/scripts/basic/.gitignore +++ b/scripts/basic/.gitignore @@ -1 +1,2 @@ fixdep +bin2c diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile index 4fcef87bb875..afbc1cd69ac5 100644 --- a/scripts/basic/Makefile +++ b/scripts/basic/Makefile @@ -9,6 +9,7 @@ # fixdep: Used to generate dependency information during build process hostprogs-y := fixdep +hostprogs-$(CONFIG_IKCONFIG) += bin2c always := $(hostprogs-y) # fixdep is needed to compile other host programs diff --git a/scripts/basic/bin2c.c b/scripts/basic/bin2c.c new file mode 100644 index 000000000000..af187e695345 --- /dev/null +++ b/scripts/basic/bin2c.c @@ -0,0 +1,35 @@ +/* + * Unloved program to convert a binary on stdin to a C include on stdout + * + * Jan 1999 Matt Mackall + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + */ + +#include + +int main(int argc, char *argv[]) +{ + int ch, total = 0; + + if (argc > 1) + printf("const char %s[] %s=\n", + argv[1], argc > 2 ? argv[2] : ""); + + do { + printf("\t\""); + while ((ch = getchar()) != EOF) { + total++; + printf("\\x%02x", ch); + if (total % 16 == 0) + break; + } + printf("\"\n"); + } while (ch != EOF); + + if (argc > 1) + printf("\t;\n\nconst int %s_size = %d;\n", argv[1], total); + + return 0; +} diff --git a/scripts/bin2c.c b/scripts/bin2c.c deleted file mode 100644 index 96dd2bcbb407..000000000000 --- a/scripts/bin2c.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Unloved program to convert a binary on stdin to a C include on stdout - * - * Jan 1999 Matt Mackall - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - */ - -#include - -int main(int argc, char *argv[]) -{ - int ch, total=0; - - if (argc > 1) - printf("const char %s[] %s=\n", - argv[1], argc > 2 ? argv[2] : ""); - - do { - printf("\t\""); - while ((ch = getchar()) != EOF) - { - total++; - printf("\\x%02x",ch); - if (total % 16 == 0) - break; - } - printf("\"\n"); - } while (ch != EOF); - - if (argc > 1) - printf("\t;\n\nconst int %s_size = %d;\n", argv[1], total); - - return 0; -} -- cgit v1.2.3 From 7d3e2bca22feb1f4a624009ff6c15e6f724cb4e7 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:43 -0700 Subject: kexec: rename unusebale_pages to unusable_pages Let's use the more common "unusable". This patch was originally written and posted by Boris. I am including it in this patch series. Signed-off-by: Borislav Petkov Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 2 +- kernel/kexec.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index a75641930049..d9bb0a57d208 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -100,7 +100,7 @@ struct kimage { struct list_head control_pages; struct list_head dest_pages; - struct list_head unuseable_pages; + struct list_head unusable_pages; /* Address of next control page to allocate for crash kernels. */ unsigned long control_page; diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c925884..c7cc2a00181c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -154,7 +154,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, INIT_LIST_HEAD(&image->dest_pages); /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); + INIT_LIST_HEAD(&image->unusable_pages); /* Read in the segments */ image->nr_segments = nr_segments; @@ -609,7 +609,7 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); + kimage_free_page_list(&image->unusable_pages); } static void kimage_terminate(struct kimage *image) @@ -732,7 +732,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page cannot be used file it away */ if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); + list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_pfn(page) << PAGE_SHIFT; -- cgit v1.2.3 From dabe78628dd886c4b71971d1d78f1cecc674b760 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:45 -0700 Subject: kexec: move segment verification code in a separate function Previously do_kimage_alloc() will allocate a kimage structure, copy segment list from user space and then do the segment list sanity verification. Break down this function in 3 parts. do_kimage_alloc_init() to do actual allocation and basic initialization of kimage structure. copy_user_segment_list() to copy segment list from user space and sanity_check_segment_list() to verify the sanity of segment list as passed by user space. In later patches, I need to only allocate kimage and not copy segment list from user space. So breaking down in smaller functions enables re-use of code at other places. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 182 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 100 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index c7cc2a00181c..062e5567750e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -125,45 +125,27 @@ static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_user_segment_list(struct kimage *image, + unsigned long nr_segments, + struct kexec_segment __user *segments) { + int ret; size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unusable_pages); /* Read in the segments */ image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) { - result = -EFAULT; - goto out; - } + ret = copy_from_user(image->segment, segments, segment_bytes); + if (ret) + ret = -EFAULT; + + return ret; +} + +static int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; /* * Verify we have good destination addresses. The caller is @@ -185,9 +167,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; + return result; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; + return result; } /* Verify our destination addresses do not overlap. @@ -208,7 +190,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - goto out; + return result; } } @@ -220,18 +202,61 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; + return result; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ - return result; + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + + return 0; +} + +static struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; } static void kimage_free_page_list(struct list_head *list); @@ -244,10 +269,19 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, struct kimage *image; /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; + + result = copy_user_segment_list(image, nr_segments, segments); if (result) - goto out; + goto out_free_image; + + result = sanity_check_segment_list(image); + if (result) + goto out_free_image; /* * Find a location for the control code buffer, and add it @@ -259,22 +293,21 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { pr_err("Could not allocate swap buffer\n"); - goto out_free; + goto out_free_control_pages; } *rimage = image; return 0; - -out_free: +out_free_control_pages: kimage_free_page_list(&image->control_pages); +out_free_image: kfree(image); -out: return result; } @@ -284,19 +317,17 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, { int result; struct kimage *image; - unsigned long i; - image = NULL; /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; - goto out; - } + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; /* Allocate and initialize a controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; /* Enable the special crash kernel control page * allocation policy. @@ -304,25 +335,13 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; + result = copy_user_segment_list(image, nr_segments, segments); + if (result) + goto out_free_image; - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out_free; - } + result = sanity_check_segment_list(image); + if (result) + goto out_free_image; /* * Find a location for the control code buffer, and add @@ -334,15 +353,14 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } *rimage = image; return 0; -out_free: +out_free_image: kfree(image); -out: return result; } -- cgit v1.2.3 From 255aedd90e3e804fb52e1a71636a3b22cf12f81b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:48 -0700 Subject: kexec: use common function for kimage_normal_alloc() and kimage_crash_alloc() kimage_normal_alloc() and kimage_crash_alloc() are doing lot of similar things and differ only little. So instead of having two separate functions create a common function kimage_alloc_init() and pass it the "flags" argument which tells whether it is normal kexec or kexec_on_panic. And this function should be able to deal with both the cases. This consolidation also helps later where we can use a common function kimage_file_alloc_init() to handle normal and crash cases for new file based kexec syscall. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 105 +++++++++++++++++++-------------------------------------- 1 file changed, 34 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 062e5567750e..bfdda316697d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -261,12 +261,20 @@ static struct kimage *do_kimage_alloc_init(void) static void kimage_free_page_list(struct list_head *list); -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) { - int result; + int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_ON_CRASH; + + if (kexec_on_panic) { + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; + } /* Allocate and initialize a controlling structure */ image = do_kimage_alloc_init(); @@ -275,20 +283,26 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, image->start = entry; - result = copy_user_segment_list(image, nr_segments, segments); - if (result) + ret = copy_user_segment_list(image, nr_segments, segments); + if (ret) goto out_free_image; - result = sanity_check_segment_list(image); - if (result) + ret = sanity_check_segment_list(image); + if (ret) goto out_free_image; + /* Enable the special crash kernel control page allocation policy. */ + if (kexec_on_panic) { + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be * counted as destination pages. */ - result = -ENOMEM; + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { @@ -296,10 +310,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, goto out_free_image; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free_control_pages; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; @@ -308,60 +324,7 @@ out_free_control_pages: kimage_free_page_list(&image->control_pages); out_free_image: kfree(image); - return result; -} - -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int result; - struct kimage *image; - - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) - return -EADDRNOTAVAIL; - - /* Allocate and initialize a controlling structure */ - image = do_kimage_alloc_init(); - if (!image) - return -ENOMEM; - - image->start = entry; - - /* Enable the special crash kernel control page - * allocation policy. - */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; - - result = copy_user_segment_list(image, nr_segments, segments); - if (result) - goto out_free_image; - - result = sanity_check_segment_list(image); - if (result) - goto out_free_image; - - /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. - */ - result = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_PAGE_SIZE)); - if (!image->control_code_page) { - pr_err("Could not allocate control_code_buffer\n"); - goto out_free_image; - } - - *rimage = image; - return 0; - -out_free_image: - kfree(image); - return result; + return ret; } static int kimage_is_destination_range(struct kimage *image, @@ -1004,16 +967,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, /* Loading another kernel to reboot into */ if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); /* Loading another kernel to switch to if this one crashes */ else if (flags & KEXEC_ON_CRASH) { /* Free any current crash dump kernel before * we corrupt it. */ kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); crash_map_reserved_pages(); } if (result) -- cgit v1.2.3 From 8c86e70acead629aacb4afcd818add66bf6844d9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:50 -0700 Subject: resource: provide new functions to walk through resources I have added two more functions to walk through resources. Currently walk_system_ram_range() deals with pfn and /proc/iomem can contain partial pages. By dealing in pfn, callback function loses the info that last page of a memory range is a partial page and not the full page. So I implemented walk_system_ram_res() which returns u64 values to callback functions and now it properly return start and end address. walk_system_ram_range() uses find_next_system_ram() to find the next ram resource. This in turn only travels through siblings of top level child and does not travers through all the nodes of the resoruce tree. I also need another function where I can walk through all the resources, for example figure out where "GART" aperture is. Figure out where ACPI memory is. So I wrote another function walk_iomem_res() which walks through all /proc/iomem resources and returns matches as asked by caller. Caller can specify "name" of resource, start and end and flags. Got rid of find_next_system_ram_res() and instead implemented more generic find_next_iomem_res() which can be used to traverse top level children only based on an argument. Signed-off-by: Vivek Goyal Cc: Yinghai Lu Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 6 +++ kernel/resource.c | 101 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 98 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5e3a906cc089..142ec544167c 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -237,6 +237,12 @@ extern int iomem_is_exclusive(u64 addr); extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); +extern int +walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)); +extern int +walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)); /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237ac32db..da14b8d09296 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static void *r_next(struct seq_file *m, void *v, loff_t *pos) +static struct resource *next_resource(struct resource *p, bool sibling_only) { - struct resource *p = v; - (*pos)++; + /* Caller wants to traverse through siblings only */ + if (sibling_only) + return p->sibling; + if (p->child) return p->child; while (!p->sibling && p->parent) @@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + return (void *)next_resource(p, false); +} + #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL = 5 }; @@ -322,16 +331,19 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * Finds the lowest memory reosurce exists within [res->start.res->end) + * Finds the lowest iomem reosurce exists with-in [res->start.res->end) * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. + * This walks through whole tree and not just first level children + * until and unless first_level_children_only is true. */ -static int find_next_system_ram(struct resource *res, char *name) +static int find_next_iomem_res(struct resource *res, char *name, + bool first_level_children_only) { resource_size_t start, end; struct resource *p; + bool sibling_only = false; BUG_ON(!res); @@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name) BUG_ON(start >= end); read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ + + if (first_level_children_only) { + p = iomem_resource.child; + sibling_only = true; + } else + p = &iomem_resource; + + while ((p = next_resource(p, sibling_only))) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) @@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name) if ((p->end >= start) && (p->start < end)) break; } + read_unlock(&resource_lock); if (!p) return -1; @@ -364,6 +383,70 @@ static int find_next_system_ram(struct resource *res, char *name) return 0; } +/* + * Walks through iomem resources and calls func() with matching resource + * ranges. This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * name are valid candidates. + * + * @name: name of resource + * @flags: resource flags + * @start: start addr + * @end: end addr + */ +int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, name, false))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". This function deals with + * full ranges and not pfn. If resources are not pfn aligned, dealing + * with pfn can truncate ranges. + */ +int walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, "System RAM", true))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) + /* * This function calls callback against all memory range of "System RAM" * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. @@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_system_ram(&res, "System RAM") >= 0)) { + (find_next_iomem_res(&res, "System RAM", true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) -- cgit v1.2.3 From f0895685c7fd8c938c91a9d8a6f7c11f22df58d2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:55 -0700 Subject: kexec: new syscall kexec_file_load() declaration This is the new syscall kexec_file_load() declaration/interface. I have reserved the syscall number only for x86_64 so far. Other architectures (including i386) can reserve syscall number when they enable the support for this new syscall. Signed-off-by: Vivek Goyal Cc: Michael Kerrisk Cc: Borislav Petkov Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 4 ++++ kernel/kexec.c | 7 +++++++ kernel/sys_ni.c | 1 + 4 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index ca2b9aa78c81..35dd922727b9 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -326,6 +326,7 @@ 317 common seccomp sys_seccomp 318 common getrandom sys_getrandom 319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 15a069425cbf..0f86d85a9ce4 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -317,6 +317,10 @@ asmlinkage long sys_restart_syscall(void); asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, unsigned long flags); +asmlinkage long sys_kexec_file_load(int kernel_fd, int initrd_fd, + unsigned long cmdline_len, + const char __user *cmdline_ptr, + unsigned long flags); asmlinkage long sys_exit(int error_code); asmlinkage long sys_exit_group(int error_code); diff --git a/kernel/kexec.c b/kernel/kexec.c index bfdda316697d..ec4386c1b94f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1058,6 +1058,13 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + return -ENOSYS; +} + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1f79e3714533..391d4ddb6f4b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); +cond_syscall(sys_kexec_file_load); cond_syscall(sys_init_module); cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); -- cgit v1.2.3 From cb1052581e2bddd6096544f3f944f4e7fdad4c7f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:57 -0700 Subject: kexec: implementation of new syscall kexec_file_load Previous patch provided the interface definition and this patch prvides implementation of new syscall. Previously segment list was prepared in user space. Now user space just passes kernel fd, initrd fd and command line and kernel will create a segment list internally. This patch contains generic part of the code. Actual segment preparation and loading is done by arch and image specific loader. Which comes in next patch. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 45 ++++ include/linux/kexec.h | 53 ++++ include/uapi/linux/kexec.h | 11 + kernel/kexec.c | 483 ++++++++++++++++++++++++++++++++++++- 4 files changed, 587 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 679cef0791cd..c8875b5545e1 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -22,6 +22,10 @@ #include #include +static struct kexec_file_ops *kexec_file_loaders[] = { + NULL, +}; + static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.pud); @@ -283,3 +287,44 @@ void arch_crash_save_vmcoreinfo(void) (unsigned long)&_text - __START_KERNEL); } +/* arch-dependent functionality related to kexec file-based syscall */ + +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + int i, ret = -ENOEXEC; + struct kexec_file_ops *fops; + + for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { + fops = kexec_file_loaders[i]; + if (!fops || !fops->probe) + continue; + + ret = fops->probe(buf, buf_len); + if (!ret) { + image->fops = fops; + return ret; + } + } + + return ret; +} + +void *arch_kexec_kernel_image_load(struct kimage *image) +{ + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); +} + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image); +} diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 66d56ac0f64c..8e80901e466f 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -121,13 +121,57 @@ struct kimage { #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 unsigned int preserve_context : 1; + /* If set, we are using file mode kexec syscall */ + unsigned int file_mode:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; #endif + + /* Additional fields for file based kexec syscall */ + void *kernel_buf; + unsigned long kernel_buf_len; + + void *initrd_buf; + unsigned long initrd_buf_len; + + char *cmdline_buf; + unsigned long cmdline_buf_len; + + /* File operations provided by image loader */ + struct kexec_file_ops *fops; + + /* Image loader handling the kernel can store a pointer here */ + void *image_loader_data; }; +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. + */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; +typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); +typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); +typedef int (kexec_cleanup_t)(struct kimage *image); + +struct kexec_file_ops { + kexec_probe_t *probe; + kexec_load_t *load; + kexec_cleanup_t *cleanup; +}; /* kexec interface functions */ extern void machine_kexec(struct kimage *image); @@ -138,6 +182,11 @@ extern asmlinkage long sys_kexec_load(unsigned long entry, struct kexec_segment __user *segments, unsigned long flags); extern int kernel_kexec(void); +extern int kexec_add_buffer(struct kimage *image, char *buffer, + unsigned long bufsz, unsigned long memsz, + unsigned long buf_align, unsigned long buf_min, + unsigned long buf_max, bool top_down, + unsigned long *load_addr); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); extern void crash_kexec(struct pt_regs *); @@ -188,6 +237,10 @@ extern int kexec_load_disabled; #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT) #endif +/* List of defined/legal kexec file flags */ +#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ + KEXEC_FILE_NO_INITRAMFS) + #define VMCOREINFO_BYTES (4096) #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index d6629d49a243..6925f5b42f89 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -13,6 +13,17 @@ #define KEXEC_PRESERVE_CONTEXT 0x00000002 #define KEXEC_ARCH_MASK 0xffff0000 +/* + * Kexec file load interface flags. + * KEXEC_FILE_UNLOAD : Unload already loaded kexec/kdump image. + * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. + * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd + * fd field. + */ +#define KEXEC_FILE_UNLOAD 0x00000001 +#define KEXEC_FILE_ON_CRASH 0x00000002 +#define KEXEC_FILE_NO_INITRAMFS 0x00000004 + /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. */ diff --git a/kernel/kexec.c b/kernel/kexec.c index ec4386c1b94f..9b46219254dd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include #include #include @@ -327,6 +329,221 @@ out_free_image: return ret; } +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) +{ + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; + goto out; + } + + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; + goto out; + } + + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } + + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } + + if (bytes == 0) + break; + pos += bytes; + } + + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +/* + * Free up memory used by kernel, initrd, and comand line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); +} + +/* + * In file mode list of segments is prepared by kernel. Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_PAGE_SIZE)); + if (!image->control_code_page) { + pr_err("Could not allocate control_code_buffer\n"); + goto out_free_post_load_bufs; + } + + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } + + *rimage = image; + return 0; +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); + kfree(image->image_loader_data); +out_free_image: + kfree(image); + return ret; +} + static int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end) @@ -644,6 +861,16 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + + kfree(image->image_loader_data); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. + */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + kfree(image); } @@ -772,10 +999,14 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -807,7 +1038,11 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - result = copy_from_user(ptr, buf, uchunk); + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; @@ -815,7 +1050,10 @@ static int kimage_load_normal_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -1062,7 +1300,72 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) { - return -ENOSYS; + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; } void crash_kexec(struct pt_regs *regs) @@ -1620,6 +1923,176 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +static int __kexec_add_segment(struct kimage *image, char *buf, + unsigned long bufsz, unsigned long mem, + unsigned long memsz) +{ + struct kexec_segment *ksegment; + + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = buf; + ksegment->bufsz = bufsz; + ksegment->mem = mem; + ksegment->memsz = memsz; + image->nr_segments++; + + return 0; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments - 1]; + *load_addr = ksegment->mem; + return 0; +} + + /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. -- cgit v1.2.3 From 12db5562e0352986a265841638482b84f3a6899b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:04 -0700 Subject: kexec: load and relocate purgatory at kernel load time Load purgatory code in RAM and relocate it based on the location. Relocation code has been inspired by module relocation code and purgatory relocation code in kexec-tools. Also compute the checksums of loaded kexec segments and store them in purgatory. Arch independent code provides this functionality so that arch dependent bootloaders can make use of it. Helper functions are provided to get/set symbol values in purgatory which are used by bootloaders later to set things like stack and entry point of second kernel etc. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 2 + arch/ia64/Kconfig | 2 + arch/m68k/Kconfig | 2 + arch/mips/Kconfig | 2 + arch/powerpc/Kconfig | 2 + arch/s390/Kconfig | 2 + arch/sh/Kconfig | 2 + arch/tile/Kconfig | 2 + arch/x86/Kconfig | 2 + arch/x86/kernel/machine_kexec_64.c | 142 ++++++++++ include/linux/kexec.h | 33 +++ kernel/kexec.c | 544 ++++++++++++++++++++++++++++++++++++- 12 files changed, 736 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8e9dbcbcf5af..cacc8d5355b3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2065,6 +2065,8 @@ config XIP_PHYS_ADDR config KEXEC bool "Kexec system call (EXPERIMENTAL)" depends on (!SMP || PM_SLEEP_SMP) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index c84c88bbbbd7..64aefb76bd69 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -549,6 +549,8 @@ source "drivers/sn/Kconfig" config KEXEC bool "kexec system call" depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 87b7c7581b1d..3ff8c9a25335 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -91,6 +91,8 @@ config MMU_SUN3 config KEXEC bool "kexec system call" depends on M68KCLASSIC + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 900c7e5333b6..df51e78a72cc 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2396,6 +2396,8 @@ source "kernel/Kconfig.preempt" config KEXEC bool "Kexec system call" + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4bc7b62fb4b6..a577609f8ed6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -399,6 +399,8 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config KEXEC bool "kexec system call" depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP)) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 05c78bb5f570..ab39ceb89ecf 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -48,6 +48,8 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config KEXEC def_bool y + select CRYPTO + select CRYPTO_SHA256 config AUDIT_ARCH def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index aa2df3eaeb29..453fa5c09550 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -595,6 +595,8 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call (EXPERIMENTAL)" depends on SUPERH32 && MMU + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 7fcd492adbfc..a3ffe2dd4832 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -191,6 +191,8 @@ source "kernel/Kconfig.hz" config KEXEC bool "kexec system call" + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 98fe3df6df82..9558b9fcafbf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1583,6 +1583,8 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" select BUILD_BIN2C + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index c8875b5545e1..88404c440727 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include #include #include @@ -328,3 +330,143 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return image->fops->cleanup(image); } + +/* + * Apply purgatory relocations. + * + * ehdr: Pointer to elf headers + * sechdrs: Pointer to section headers. + * relsec: section index of SHT_RELA section. + * + * TODO: Some of the code belongs to generic code. Move that in kexec.c. + */ +int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, + Elf64_Shdr *sechdrs, unsigned int relsec) +{ + unsigned int i; + Elf64_Rela *rel; + Elf64_Sym *sym; + void *location; + Elf64_Shdr *section, *symtabsec; + unsigned long address, sec_base, value; + const char *strtab, *name, *shstrtab; + + /* + * ->sh_offset has been modified to keep the pointer to section + * contents in memory + */ + rel = (void *)sechdrs[relsec].sh_offset; + + /* Section to which relocations apply */ + section = &sechdrs[sechdrs[relsec].sh_info]; + + pr_debug("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + + /* Associated symbol table */ + symtabsec = &sechdrs[sechdrs[relsec].sh_link]; + + /* String table */ + if (symtabsec->sh_link >= ehdr->e_shnum) { + /* Invalid strtab section number */ + pr_err("Invalid string table section index %d\n", + symtabsec->sh_link); + return -ENOEXEC; + } + + strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset; + + /* section header string table */ + shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset; + + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + + /* + * rel[i].r_offset contains byte offset from beginning + * of section to the storage unit affected. + * + * This is location to update (->sh_offset). This is temporary + * buffer where section is currently loaded. This will finally + * be loaded to a different address later, pointed to by + * ->sh_addr. kexec takes care of moving it + * (kexec_load_segment()). + */ + location = (void *)(section->sh_offset + rel[i].r_offset); + + /* Final address of the location */ + address = section->sh_addr + rel[i].r_offset; + + /* + * rel[i].r_info contains information about symbol table index + * w.r.t which relocation must be made and type of relocation + * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get + * these respectively. + */ + sym = (Elf64_Sym *)symtabsec->sh_offset + + ELF64_R_SYM(rel[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", + name, sym->st_info, sym->st_shndx, sym->st_value, + sym->st_size); + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= ehdr->e_shnum) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = sechdrs[sym->st_shndx].sh_addr; + + value = sym->st_value; + value += sec_base; + value += rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)location = value; + break; + case R_X86_64_32: + *(u32 *)location = value; + if (value != *(u32 *)location) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)location = value; + if ((s64)value != *(s32 *)location) + goto overflow; + break; + case R_X86_64_PC32: + value -= (u64)address; + *(u32 *)location = value; + break; + default: + pr_err("Unknown rela relocation: %llu\n", + ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + pr_err("Overflow in relocation type %d value 0x%lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), value); + return -ENOEXEC; +} diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 8e80901e466f..84f09e9eca26 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -10,6 +10,7 @@ #include #include #include +#include #include /* Verify architecture specific macros are defined */ @@ -95,6 +96,27 @@ struct compat_kexec_segment { }; #endif +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +struct purgatory_info { + /* Pointer to elf header of read only purgatory */ + Elf_Ehdr *ehdr; + + /* Pointer to purgatory sechdrs which are modifiable */ + Elf_Shdr *sechdrs; + /* + * Temporary buffer location where purgatory is loaded and relocated + * This memory can be freed post image load + */ + void *purgatory_buf; + + /* Address where purgatory is finally loaded and is executed from */ + unsigned long purgatory_load_addr; +}; + struct kimage { kimage_entry_t head; kimage_entry_t *entry; @@ -143,6 +165,9 @@ struct kimage { /* Image loader handling the kernel can store a pointer here */ void *image_loader_data; + + /* Information for loading purgatory */ + struct purgatory_info purgatory_info; }; /* @@ -189,6 +214,14 @@ extern int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long *load_addr); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); +extern int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr); +extern int kexec_purgatory_get_set_symbol(struct kimage *image, + const char *name, void *buf, + unsigned int size, bool get_value); +extern void *kexec_purgatory_get_symbol_addr(struct kimage *image, + const char *name); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); diff --git a/kernel/kexec.c b/kernel/kexec.c index 9b46219254dd..669e331aa9ec 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -42,6 +42,9 @@ #include #include +#include +#include + /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -54,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. + */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -404,6 +416,24 @@ void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { } +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + /* * Free up memory used by kernel, initrd, and comand line. This is temporary * memory allocation which is not needed any more after these buffers have @@ -411,6 +441,8 @@ void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) */ static void kimage_file_post_load_cleanup(struct kimage *image) { + struct purgatory_info *pi = &image->purgatory_info; + vfree(image->kernel_buf); image->kernel_buf = NULL; @@ -420,6 +452,12 @@ static void kimage_file_post_load_cleanup(struct kimage *image) kfree(image->cmdline_buf); image->cmdline_buf = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + /* See if architecture has anything to cleanup post load */ arch_kimage_file_post_load_cleanup(image); } @@ -1105,7 +1143,7 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + buf += mchunk; mbytes -= mchunk; } out: @@ -1340,6 +1378,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + for (i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; @@ -2092,6 +2134,506 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, return 0; } +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. + */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. + */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective archicture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr *sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} /* * Move into place and start executing a preloaded standalone -- cgit v1.2.3 From 27f48d3e633be23656a097baa3be336e04a82d84 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:06 -0700 Subject: kexec-bzImage64: support for loading bzImage using 64bit entry This is loader specific code which can load bzImage and set it up for 64bit entry. This does not take care of 32bit entry or real mode entry. 32bit mode entry can be implemented if somebody needs it. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kexec-bzimage64.h | 6 + arch/x86/include/asm/kexec.h | 21 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/kexec-bzimage64.c | 375 +++++++++++++++++++++++++++++++++ arch/x86/kernel/machine_kexec_64.c | 5 +- include/linux/kexec.h | 2 +- kernel/kexec.c | 11 +- 7 files changed, 415 insertions(+), 6 deletions(-) create mode 100644 arch/x86/include/asm/kexec-bzimage64.h create mode 100644 arch/x86/kernel/kexec-bzimage64.c (limited to 'kernel') diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h new file mode 100644 index 000000000000..d1b5d194e31d --- /dev/null +++ b/arch/x86/include/asm/kexec-bzimage64.h @@ -0,0 +1,6 @@ +#ifndef _ASM_KEXEC_BZIMAGE64_H +#define _ASM_KEXEC_BZIMAGE64_H + +extern struct kexec_file_ops kexec_bzImage64_ops; + +#endif /* _ASM_KEXE_BZIMAGE64_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 17483a492f18..0dfccced4edf 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -23,6 +23,7 @@ #include #include +#include /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. @@ -161,6 +162,26 @@ struct kimage_arch { pmd_t *pmd; pte_t *pte; }; + +struct kexec_entry64_regs { + uint64_t rax; + uint64_t rbx; + uint64_t rcx; + uint64_t rdx; + uint64_t rsi; + uint64_t rdi; + uint64_t rsp; + uint64_t rbp; + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + uint64_t rip; +}; #endif typedef void crash_vmclear_fn(void); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index bde3993624f1..b5ea75c4a4b4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -118,4 +118,5 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o + obj-$(CONFIG_KEXEC) += kexec-bzimage64.o endif diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c new file mode 100644 index 000000000000..bcedd100192f --- /dev/null +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -0,0 +1,375 @@ +/* + * Kexec bzImage loader + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#define pr_fmt(fmt) "kexec-bzImage64: " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Defines lowest physical address for various segments. Not sure where + * exactly these limits came from. Current bzimage64 loader in kexec-tools + * uses these so I am retaining it. It can be changed over time as we gain + * more insight. + */ +#define MIN_PURGATORY_ADDR 0x3000 +#define MIN_BOOTPARAM_ADDR 0x3000 +#define MIN_KERNEL_LOAD_ADDR 0x100000 +#define MIN_INITRD_LOAD_ADDR 0x1000000 + +/* + * This is a place holder for all boot loader specific data structure which + * gets allocated in one call but gets freed much later during cleanup + * time. Right now there is only one field but it can grow as need be. + */ +struct bzimage64_data { + /* + * Temporary buffer to hold bootparams buffer. This should be + * freed once the bootparam segment has been loaded. + */ + void *bootparams_buf; +}; + +static int setup_initrd(struct boot_params *params, + unsigned long initrd_load_addr, unsigned long initrd_len) +{ + params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL; + params->hdr.ramdisk_size = initrd_len & 0xffffffffUL; + + params->ext_ramdisk_image = initrd_load_addr >> 32; + params->ext_ramdisk_size = initrd_len >> 32; + + return 0; +} + +static int setup_cmdline(struct boot_params *params, + unsigned long bootparams_load_addr, + unsigned long cmdline_offset, char *cmdline, + unsigned long cmdline_len) +{ + char *cmdline_ptr = ((char *)params) + cmdline_offset; + unsigned long cmdline_ptr_phys; + uint32_t cmdline_low_32, cmdline_ext_32; + + memcpy(cmdline_ptr, cmdline, cmdline_len); + cmdline_ptr[cmdline_len - 1] = '\0'; + + cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; + cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; + cmdline_ext_32 = cmdline_ptr_phys >> 32; + + params->hdr.cmd_line_ptr = cmdline_low_32; + if (cmdline_ext_32) + params->ext_cmd_line_ptr = cmdline_ext_32; + + return 0; +} + +static int setup_memory_map_entries(struct boot_params *params) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = e820_saved.nr_map; + + /* TODO: Pass entries more than E820MAX in bootparams setup data */ + if (nr_e820_entries > E820MAX) + nr_e820_entries = E820MAX; + + params->e820_entries = nr_e820_entries; + memcpy(¶ms->e820_map, &e820_saved.map, + nr_e820_entries * sizeof(struct e820entry)); + + return 0; +} + +static int setup_boot_parameters(struct boot_params *params) +{ + unsigned int nr_e820_entries; + unsigned long long mem_k, start, end; + int i; + + /* Get subarch from existing bootparams */ + params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; + + /* Copying screen_info will do? */ + memcpy(¶ms->screen_info, &boot_params.screen_info, + sizeof(struct screen_info)); + + /* Fill in memsize later */ + params->screen_info.ext_mem_k = 0; + params->alt_mem_k = 0; + + /* Default APM info */ + memset(¶ms->apm_bios_info, 0, sizeof(params->apm_bios_info)); + + /* Default drive info */ + memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); + memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); + + /* Default sysdesc table */ + params->sys_desc_table.length = 0; + + setup_memory_map_entries(params); + nr_e820_entries = params->e820_entries; + + for (i = 0; i < nr_e820_entries; i++) { + if (params->e820_map[i].type != E820_RAM) + continue; + start = params->e820_map[i].addr; + end = params->e820_map[i].addr + params->e820_map[i].size - 1; + + if ((start <= 0x100000) && end > 0x100000) { + mem_k = (end >> 10) - (0x100000 >> 10); + params->screen_info.ext_mem_k = mem_k; + params->alt_mem_k = mem_k; + if (mem_k > 0xfc00) + params->screen_info.ext_mem_k = 0xfc00; /* 64M*/ + if (mem_k > 0xffffffff) + params->alt_mem_k = 0xffffffff; + } + } + + /* Setup EDD info */ + memcpy(params->eddbuf, boot_params.eddbuf, + EDDMAXNR * sizeof(struct edd_info)); + params->eddbuf_entries = boot_params.eddbuf_entries; + + memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer, + EDD_MBR_SIG_MAX * sizeof(unsigned int)); + + return 0; +} + +int bzImage64_probe(const char *buf, unsigned long len) +{ + int ret = -ENOEXEC; + struct setup_header *header; + + /* kernel should be atleast two sectors long */ + if (len < 2 * 512) { + pr_err("File is too short to be a bzImage\n"); + return ret; + } + + header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr)); + if (memcmp((char *)&header->header, "HdrS", 4) != 0) { + pr_err("Not a bzImage\n"); + return ret; + } + + if (header->boot_flag != 0xAA55) { + pr_err("No x86 boot sector present\n"); + return ret; + } + + if (header->version < 0x020C) { + pr_err("Must be at least protocol version 2.12\n"); + return ret; + } + + if (!(header->loadflags & LOADED_HIGH)) { + pr_err("zImage not a bzImage\n"); + return ret; + } + + if (!(header->xloadflags & XLF_KERNEL_64)) { + pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n"); + return ret; + } + + if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) { + pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n"); + return ret; + } + + /* I've got a bzImage */ + pr_debug("It's a relocatable bzImage64\n"); + ret = 0; + + return ret; +} + +void *bzImage64_load(struct kimage *image, char *kernel, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + + struct setup_header *header; + int setup_sects, kern16_size, ret = 0; + unsigned long setup_header_size, params_cmdline_sz; + struct boot_params *params; + unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; + unsigned long purgatory_load_addr; + unsigned long kernel_bufsz, kernel_memsz, kernel_align; + char *kernel_buf; + struct bzimage64_data *ldata; + struct kexec_entry64_regs regs64; + void *stack; + unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); + + header = (struct setup_header *)(kernel + setup_hdr_offset); + setup_sects = header->setup_sects; + if (setup_sects == 0) + setup_sects = 4; + + kern16_size = (setup_sects + 1) * 512; + if (kernel_len < kern16_size) { + pr_err("bzImage truncated\n"); + return ERR_PTR(-ENOEXEC); + } + + if (cmdline_len > header->cmdline_size) { + pr_err("Kernel command line too long\n"); + return ERR_PTR(-EINVAL); + } + + /* + * Load purgatory. For 64bit entry point, purgatory code can be + * anywhere. + */ + ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1, + &purgatory_load_addr); + if (ret) { + pr_err("Loading purgatory failed\n"); + return ERR_PTR(ret); + } + + pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); + + /* Load Bootparams and cmdline */ + params_cmdline_sz = sizeof(struct boot_params) + cmdline_len; + params = kzalloc(params_cmdline_sz, GFP_KERNEL); + if (!params) + return ERR_PTR(-ENOMEM); + + /* Copy setup header onto bootparams. Documentation/x86/boot.txt */ + setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; + + /* Is there a limit on setup header size? */ + memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); + + ret = kexec_add_buffer(image, (char *)params, params_cmdline_sz, + params_cmdline_sz, 16, MIN_BOOTPARAM_ADDR, + ULONG_MAX, 1, &bootparam_load_addr); + if (ret) + goto out_free_params; + pr_debug("Loaded boot_param and command line at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + bootparam_load_addr, params_cmdline_sz, params_cmdline_sz); + + /* Load kernel */ + kernel_buf = kernel + kern16_size; + kernel_bufsz = kernel_len - kern16_size; + kernel_memsz = PAGE_ALIGN(header->init_size); + kernel_align = header->kernel_alignment; + + ret = kexec_add_buffer(image, kernel_buf, + kernel_bufsz, kernel_memsz, kernel_align, + MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, + &kernel_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_load_addr, kernel_memsz, kernel_memsz); + + /* Load initrd high */ + if (initrd) { + ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, + PAGE_SIZE, MIN_INITRD_LOAD_ADDR, + ULONG_MAX, 1, &initrd_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, initrd_len, initrd_len); + + setup_initrd(params, initrd_load_addr, initrd_len); + } + + setup_cmdline(params, bootparam_load_addr, sizeof(struct boot_params), + cmdline, cmdline_len); + + /* bootloader info. Do we need a separate ID for kexec kernel loader? */ + params->hdr.type_of_loader = 0x0D << 4; + params->hdr.loadflags = 0; + + /* Setup purgatory regs for entry */ + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 1); + if (ret) + goto out_free_params; + + regs64.rbx = 0; /* Bootstrap Processor */ + regs64.rsi = bootparam_load_addr; + regs64.rip = kernel_load_addr + 0x200; + stack = kexec_purgatory_get_symbol_addr(image, "stack_end"); + if (IS_ERR(stack)) { + pr_err("Could not find address of symbol stack_end\n"); + ret = -EINVAL; + goto out_free_params; + } + + regs64.rsp = (unsigned long)stack; + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 0); + if (ret) + goto out_free_params; + + setup_boot_parameters(params); + + /* Allocate loader specific data */ + ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); + if (!ldata) { + ret = -ENOMEM; + goto out_free_params; + } + + /* + * Store pointer to params so that it could be freed after loading + * params segment has been loaded and contents have been copied + * somewhere else. + */ + ldata->bootparams_buf = params; + return ldata; + +out_free_params: + kfree(params); + return ERR_PTR(ret); +} + +/* This cleanup function is called after various segments have been loaded */ +int bzImage64_cleanup(void *loader_data) +{ + struct bzimage64_data *ldata = loader_data; + + if (!ldata) + return 0; + + kfree(ldata->bootparams_buf); + ldata->bootparams_buf = NULL; + + return 0; +} + +struct kexec_file_ops kexec_bzImage64_ops = { + .probe = bzImage64_probe, + .load = bzImage64_load, + .cleanup = bzImage64_cleanup, +}; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 88404c440727..18d0f9e0b6da 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -23,9 +23,10 @@ #include #include #include +#include static struct kexec_file_ops *kexec_file_loaders[] = { - NULL, + &kexec_bzImage64_ops, }; static void free_transition_pgtable(struct kimage *image) @@ -328,7 +329,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) if (!image->fops || !image->fops->cleanup) return 0; - return image->fops->cleanup(image); + return image->fops->cleanup(image->image_loader_data); } /* diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 84f09e9eca26..9481703b0e7a 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -190,7 +190,7 @@ typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, unsigned long kernel_len, char *initrd, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len); -typedef int (kexec_cleanup_t)(struct kimage *image); +typedef int (kexec_cleanup_t)(void *loader_data); struct kexec_file_ops { kexec_probe_t *probe; diff --git a/kernel/kexec.c b/kernel/kexec.c index 669e331aa9ec..0926f2a3ed03 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -460,6 +460,14 @@ static void kimage_file_post_load_cleanup(struct kimage *image) /* See if architecture has anything to cleanup post load */ arch_kimage_file_post_load_cleanup(image); + + /* + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. + */ + kfree(image->image_loader_data); + image->image_loader_data = NULL; } /* @@ -576,7 +584,6 @@ out_free_control_pages: kimage_free_page_list(&image->control_pages); out_free_post_load_bufs: kimage_file_post_load_cleanup(image); - kfree(image->image_loader_data); out_free_image: kfree(image); return ret; @@ -900,8 +907,6 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); - kfree(image->image_loader_data); - /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. -- cgit v1.2.3 From dd5f726076cc7639d9713b334c8c133f77c6757a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:09 -0700 Subject: kexec: support for kexec on panic using new system call This patch adds support for loading a kexec on panic (kdump) kernel usning new system call. It prepares ELF headers for memory areas to be dumped and for saved cpu registers. Also prepares the memory map for second kernel and limits its boot to reserved areas only. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/crash.h | 9 + arch/x86/include/asm/kexec.h | 30 +- arch/x86/kernel/crash.c | 563 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/kexec-bzimage64.c | 55 +++- arch/x86/kernel/machine_kexec_64.c | 40 +++ arch/x86/purgatory/entry64.S | 6 +- kernel/kexec.c | 46 ++- 7 files changed, 724 insertions(+), 25 deletions(-) create mode 100644 arch/x86/include/asm/crash.h (limited to 'kernel') diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h new file mode 100644 index 000000000000..f498411f2500 --- /dev/null +++ b/arch/x86/include/asm/crash.h @@ -0,0 +1,9 @@ +#ifndef _ASM_X86_CRASH_H +#define _ASM_X86_CRASH_H + +int crash_load_segments(struct kimage *image); +int crash_copy_backup_region(struct kimage *image); +int crash_setup_memmap_entries(struct kimage *image, + struct boot_params *params); + +#endif /* _ASM_X86_CRASH_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 0dfccced4edf..d2434c1cad05 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -25,6 +25,8 @@ #include #include +struct kimage; + /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. * I.e. Maximum page that is mapped directly into kernel memory, @@ -62,6 +64,10 @@ # define KEXEC_ARCH KEXEC_ARCH_X86_64 #endif +/* Memory to backup during crash kdump */ +#define KEXEC_BACKUP_SRC_START (0UL) +#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ + /* * CPU does not save ss and sp on stack if execution is already * running in kernel mode at the time of NMI occurrence. This code @@ -161,17 +167,35 @@ struct kimage_arch { pud_t *pud; pmd_t *pmd; pte_t *pte; + /* Details of backup region */ + unsigned long backup_src_start; + unsigned long backup_src_sz; + + /* Physical address of backup segment */ + unsigned long backup_load_addr; + + /* Core ELF header buffer */ + void *elf_headers; + unsigned long elf_headers_sz; + unsigned long elf_load_addr; }; +#endif /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_64 +/* + * Number of elements and order of elements in this structure should match + * with the ones in arch/x86/purgatory/entry64.S. If you make a change here + * make an appropriate change in purgatory too. + */ struct kexec_entry64_regs { uint64_t rax; - uint64_t rbx; uint64_t rcx; uint64_t rdx; - uint64_t rsi; - uint64_t rdi; + uint64_t rbx; uint64_t rsp; uint64_t rbp; + uint64_t rsi; + uint64_t rdi; uint64_t r8; uint64_t r9; uint64_t r10; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 507de8066594..0553a34fa0df 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -4,9 +4,14 @@ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) * * Copyright (C) IBM Corporation, 2004. All rights reserved. + * Copyright (C) Red Hat Inc., 2014. All rights reserved. + * Authors: + * Vivek Goyal * */ +#define pr_fmt(fmt) "kexec: " fmt + #include #include #include @@ -16,6 +21,7 @@ #include #include #include +#include #include #include @@ -28,6 +34,45 @@ #include #include +/* Alignment required for elf header segment */ +#define ELF_CORE_HEADER_ALIGN 4096 + +/* This primarily represents number of split ranges due to exclusion */ +#define CRASH_MAX_RANGES 16 + +struct crash_mem_range { + u64 start, end; +}; + +struct crash_mem { + unsigned int nr_ranges; + struct crash_mem_range ranges[CRASH_MAX_RANGES]; +}; + +/* Misc data about ram ranges needed to prepare elf headers */ +struct crash_elf_data { + struct kimage *image; + /* + * Total number of ram ranges we have after various adjustments for + * GART, crash reserved region etc. + */ + unsigned int max_nr_ranges; + unsigned long gart_start, gart_end; + + /* Pointer to elf header */ + void *ehdr; + /* Pointer to next phdr */ + void *bufp; + struct crash_mem mem; +}; + +/* Used while preparing memory map entries for second kernel */ +struct crash_memmap_data { + struct boot_params *params; + /* Type of memory */ + unsigned int type; +}; + int in_crash_kexec; /* @@ -39,6 +84,7 @@ int in_crash_kexec; */ crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); +unsigned long crash_zero_bytes; static inline void cpu_crash_vmclear_loaded_vmcss(void) { @@ -135,3 +181,520 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif crash_save_cpu(regs, safe_smp_processor_id()); } + +#ifdef CONFIG_X86_64 + +static int get_nr_ram_ranges_callback(unsigned long start_pfn, + unsigned long nr_pfn, void *arg) +{ + int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +static int get_gart_ranges_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + + ced->gart_start = start; + ced->gart_end = end; + + /* Not expecting more than 1 gart aperture */ + return 1; +} + + +/* Gather all the required information to prepare elf headers for ram regions */ +static void fill_up_crash_elf_data(struct crash_elf_data *ced, + struct kimage *image) +{ + unsigned int nr_ranges = 0; + + ced->image = image; + + walk_system_ram_range(0, -1, &nr_ranges, + get_nr_ram_ranges_callback); + + ced->max_nr_ranges = nr_ranges; + + /* + * We don't create ELF headers for GART aperture as an attempt + * to dump this memory in second kernel leads to hang/crash. + * If gart aperture is present, one needs to exclude that region + * and that could lead to need of extra phdr. + */ + walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, + ced, get_gart_ranges_callback); + + /* + * If we have gart region, excluding that could potentially split + * a memory range, resulting in extra header. Account for that. + */ + if (ced->gart_end) + ced->max_nr_ranges++; + + /* Exclusion of crash region could split memory ranges */ + ced->max_nr_ranges++; + + /* If crashk_low_res is not 0, another range split possible */ + if (crashk_low_res.end != 0) + ced->max_nr_ranges++; +} + +static int exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + mem->ranges[i].end = mstart - 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happend, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == CRASH_MAX_RANGES - 1) { + pr_err("Too many crash ranges after split\n"); + return -ENOMEM; + } + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +/* + * Look for any unwanted ranges between mstart, mend and remove them. This + * might lead to split and split ranges are put in ced->mem.ranges[] array + */ +static int elf_header_exclude_ranges(struct crash_elf_data *ced, + unsigned long long mstart, unsigned long long mend) +{ + struct crash_mem *cmem = &ced->mem; + int ret = 0; + + memset(cmem->ranges, 0, sizeof(cmem->ranges)); + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude crashkernel region */ + ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) + return ret; + + ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + if (ret) + return ret; + + /* Exclude GART region */ + if (ced->gart_end) { + ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end); + if (ret) + return ret; + } + + return ret; +} + +static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long mstart, mend; + struct kimage *image = ced->image; + struct crash_mem *cmem; + int ret, i; + + ehdr = ced->ehdr; + + /* Exclude unwanted mem ranges */ + ret = elf_header_exclude_ranges(ced, start, end); + if (ret) + return ret; + + /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */ + cmem = &ced->mem; + + for (i = 0; i < cmem->nr_ranges; i++) { + mstart = cmem->ranges[i].start; + mend = cmem->ranges[i].end; + + phdr = ced->bufp; + ced->bufp += sizeof(Elf64_Phdr); + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + /* + * If a range matches backup region, adjust offset to backup + * segment. + */ + if (mstart == image->arch.backup_src_start && + (mend - mstart + 1) == image->arch.backup_src_sz) + phdr->p_offset = image->arch.backup_load_addr; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + return ret; +} + +static int prepare_elf64_headers(struct crash_elf_data *ced, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf, *bufp; + unsigned int cpu; + unsigned long long notes_addr; + int ret; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += ced->max_nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area on x86_64 (ffffffff80000000 - ffffffffa0000000). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and other will have __va(physical) addresses. + */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + bufp = buf; + ehdr = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + (ehdr->e_phnum)++; + +#ifdef CONFIG_X86_64 + /* Prepare PT_LOAD type program header for kernel text region */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + (ehdr->e_phnum)++; +#endif + + /* Prepare PT_LOAD headers for system ram chunks. */ + ced->ehdr = ehdr; + ced->bufp = bufp; + ret = walk_system_ram_res(0, -1, ced, + prepare_elf64_ram_headers_callback); + if (ret < 0) + return ret; + + *addr = buf; + *sz = elf_sz; + return 0; +} + +/* Prepare elf headers. Return addr and size */ +static int prepare_elf_headers(struct kimage *image, void **addr, + unsigned long *sz) +{ + struct crash_elf_data *ced; + int ret; + + ced = kzalloc(sizeof(*ced), GFP_KERNEL); + if (!ced) + return -ENOMEM; + + fill_up_crash_elf_data(ced, image); + + /* By default prepare 64bit headers */ + ret = prepare_elf64_headers(ced, addr, sz); + kfree(ced); + return ret; +} + +static int add_e820_entry(struct boot_params *params, struct e820entry *entry) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = params->e820_entries; + if (nr_e820_entries >= E820MAX) + return 1; + + memcpy(¶ms->e820_map[nr_e820_entries], entry, + sizeof(struct e820entry)); + params->e820_entries++; + return 0; +} + +static int memmap_entry_callback(u64 start, u64 end, void *arg) +{ + struct crash_memmap_data *cmd = arg; + struct boot_params *params = cmd->params; + struct e820entry ei; + + ei.addr = start; + ei.size = end - start + 1; + ei.type = cmd->type; + add_e820_entry(params, &ei); + + return 0; +} + +static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, + unsigned long long mstart, + unsigned long long mend) +{ + unsigned long start, end; + int ret = 0; + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude Backup region */ + start = image->arch.backup_load_addr; + end = start + image->arch.backup_src_sz - 1; + ret = exclude_mem_range(cmem, start, end); + if (ret) + return ret; + + /* Exclude elf header region */ + start = image->arch.elf_load_addr; + end = start + image->arch.elf_headers_sz - 1; + return exclude_mem_range(cmem, start, end); +} + +/* Prepare memory map for crash dump kernel */ +int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) +{ + int i, ret = 0; + unsigned long flags; + struct e820entry ei; + struct crash_memmap_data cmd; + struct crash_mem *cmem; + + cmem = vzalloc(sizeof(struct crash_mem)); + if (!cmem) + return -ENOMEM; + + memset(&cmd, 0, sizeof(struct crash_memmap_data)); + cmd.params = params; + + /* Add first 640K segment */ + ei.addr = image->arch.backup_src_start; + ei.size = image->arch.backup_src_sz; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + + /* Add ACPI tables */ + cmd.type = E820_ACPI; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add ACPI Non-volatile Storage */ + cmd.type = E820_NVS; + walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add crashk_low_res region */ + if (crashk_low_res.end) { + ei.addr = crashk_low_res.start; + ei.size = crashk_low_res.end - crashk_low_res.start + 1; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + + /* Exclude some ranges from crashk_res and add rest to memmap */ + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, + crashk_res.end); + if (ret) + goto out; + + for (i = 0; i < cmem->nr_ranges; i++) { + ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1; + + /* If entry is less than a page, skip it */ + if (ei.size < PAGE_SIZE) + continue; + ei.addr = cmem->ranges[i].start; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + +out: + vfree(cmem); + return ret; +} + +static int determine_backup_region(u64 start, u64 end, void *arg) +{ + struct kimage *image = arg; + + image->arch.backup_src_start = start; + image->arch.backup_src_sz = end - start + 1; + + /* Expecting only one range for backup region */ + return 1; +} + +int crash_load_segments(struct kimage *image) +{ + unsigned long src_start, src_sz, elf_sz; + void *elf_addr; + int ret; + + /* + * Determine and load a segment for backup area. First 640K RAM + * region is backup source + */ + + ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, + image, determine_backup_region); + + /* Zero or postive return values are ok */ + if (ret < 0) + return ret; + + src_start = image->arch.backup_src_start; + src_sz = image->arch.backup_src_sz; + + /* Add backup segment. */ + if (src_sz) { + /* + * Ideally there is no source for backup segment. This is + * copied in purgatory after crash. Just add a zero filled + * segment for now to make sure checksum logic works fine. + */ + ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, + sizeof(crash_zero_bytes), src_sz, + PAGE_SIZE, 0, -1, 0, + &image->arch.backup_load_addr); + if (ret) + return ret; + pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", + image->arch.backup_load_addr, src_start, src_sz); + } + + /* Prepare elf headers and add a segment */ + ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + if (ret) + return ret; + + image->arch.elf_headers = elf_addr; + image->arch.elf_headers_sz = elf_sz; + + ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, + ELF_CORE_HEADER_ALIGN, 0, -1, 0, + &image->arch.elf_load_addr); + if (ret) { + vfree((void *)image->arch.elf_headers); + return ret; + } + pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->arch.elf_load_addr, elf_sz, elf_sz); + + return ret; +} + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index bcedd100192f..a8e646458a10 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -21,6 +21,9 @@ #include #include +#include + +#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ /* * Defines lowest physical address for various segments. Not sure where @@ -58,18 +61,24 @@ static int setup_initrd(struct boot_params *params, return 0; } -static int setup_cmdline(struct boot_params *params, +static int setup_cmdline(struct kimage *image, struct boot_params *params, unsigned long bootparams_load_addr, unsigned long cmdline_offset, char *cmdline, unsigned long cmdline_len) { char *cmdline_ptr = ((char *)params) + cmdline_offset; - unsigned long cmdline_ptr_phys; + unsigned long cmdline_ptr_phys, len; uint32_t cmdline_low_32, cmdline_ext_32; memcpy(cmdline_ptr, cmdline, cmdline_len); + if (image->type == KEXEC_TYPE_CRASH) { + len = sprintf(cmdline_ptr + cmdline_len - 1, + " elfcorehdr=0x%lx", image->arch.elf_load_addr); + cmdline_len += len; + } cmdline_ptr[cmdline_len - 1] = '\0'; + pr_debug("Final command line is: %s\n", cmdline_ptr); cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; cmdline_ext_32 = cmdline_ptr_phys >> 32; @@ -98,11 +107,12 @@ static int setup_memory_map_entries(struct boot_params *params) return 0; } -static int setup_boot_parameters(struct boot_params *params) +static int setup_boot_parameters(struct kimage *image, + struct boot_params *params) { unsigned int nr_e820_entries; unsigned long long mem_k, start, end; - int i; + int i, ret = 0; /* Get subarch from existing bootparams */ params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; @@ -125,7 +135,13 @@ static int setup_boot_parameters(struct boot_params *params) /* Default sysdesc table */ params->sys_desc_table.length = 0; - setup_memory_map_entries(params); + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_setup_memmap_entries(image, params); + if (ret) + return ret; + } else + setup_memory_map_entries(params); + nr_e820_entries = params->e820_entries; for (i = 0; i < nr_e820_entries; i++) { @@ -153,7 +169,7 @@ static int setup_boot_parameters(struct boot_params *params) memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer, EDD_MBR_SIG_MAX * sizeof(unsigned int)); - return 0; + return ret; } int bzImage64_probe(const char *buf, unsigned long len) @@ -240,6 +256,22 @@ void *bzImage64_load(struct kimage *image, char *kernel, return ERR_PTR(-EINVAL); } + /* + * In case of crash dump, we will append elfcorehdr= to + * command line. Make sure it does not overflow + */ + if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) { + pr_debug("Appending elfcorehdr= to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } + + /* Allocate and load backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_load_segments(image); + if (ret) + return ERR_PTR(ret); + } + /* * Load purgatory. For 64bit entry point, purgatory code can be * anywhere. @@ -254,7 +286,8 @@ void *bzImage64_load(struct kimage *image, char *kernel, pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); /* Load Bootparams and cmdline */ - params_cmdline_sz = sizeof(struct boot_params) + cmdline_len; + params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + + MAX_ELFCOREHDR_STR_LEN; params = kzalloc(params_cmdline_sz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); @@ -303,8 +336,8 @@ void *bzImage64_load(struct kimage *image, char *kernel, setup_initrd(params, initrd_load_addr, initrd_len); } - setup_cmdline(params, bootparam_load_addr, sizeof(struct boot_params), - cmdline, cmdline_len); + setup_cmdline(image, params, bootparam_load_addr, + sizeof(struct boot_params), cmdline, cmdline_len); /* bootloader info. Do we need a separate ID for kexec kernel loader? */ params->hdr.type_of_loader = 0x0D << 4; @@ -332,7 +365,9 @@ void *bzImage64_load(struct kimage *image, char *kernel, if (ret) goto out_free_params; - setup_boot_parameters(params); + ret = setup_boot_parameters(image, params); + if (ret) + goto out_free_params; /* Allocate loader specific data */ ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 18d0f9e0b6da..9330434da777 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -178,6 +178,38 @@ static void load_segments(void) ); } +/* Update purgatory as needed after various image segments have been prepared */ +static int arch_update_purgatory(struct kimage *image) +{ + int ret = 0; + + if (!image->file_mode) + return 0; + + /* Setup copying of backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + &image->arch.backup_load_addr, + sizeof(image->arch.backup_load_addr), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_src", + &image->arch.backup_src_start, + sizeof(image->arch.backup_src_start), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + &image->arch.backup_src_sz, + sizeof(image->arch.backup_src_sz), 0); + if (ret) + return ret; + } + + return ret; +} + int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -191,6 +223,11 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; + /* update purgatory as needed */ + result = arch_update_purgatory(image); + if (result) + return result; + return 0; } @@ -315,6 +352,9 @@ int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, void *arch_kexec_kernel_image_load(struct kimage *image) { + vfree(image->arch.elf_headers); + image->arch.elf_headers = NULL; + if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S index be3249d7ed2d..d1a4291d3568 100644 --- a/arch/x86/purgatory/entry64.S +++ b/arch/x86/purgatory/entry64.S @@ -61,13 +61,13 @@ new_cs_exit: .balign 4 entry64_regs: rax: .quad 0x0 -rbx: .quad 0x0 rcx: .quad 0x0 rdx: .quad 0x0 -rsi: .quad 0x0 -rdi: .quad 0x0 +rbx: .quad 0x0 rsp: .quad 0x0 rbp: .quad 0x0 +rsi: .quad 0x0 +rdi: .quad 0x0 r8: .quad 0x0 r9: .quad 0x0 r10: .quad 0x0 diff --git a/kernel/kexec.c b/kernel/kexec.c index 0926f2a3ed03..f18c780f9716 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -548,6 +548,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, { int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; image = do_kimage_alloc_init(); if (!image) @@ -555,6 +556,12 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, image->file_mode = 1; + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, cmdline_ptr, cmdline_len, flags); if (ret) @@ -572,10 +579,12 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, goto out_free_post_load_bufs; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err(KERN_ERR "Could not allocate swap buffer\n"); - goto out_free_control_pages; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; @@ -1113,10 +1122,14 @@ static int kimage_load_crash_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -1139,7 +1152,12 @@ static int kimage_load_crash_segment(struct kimage *image, /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } - result = copy_from_user(ptr, buf, uchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); if (result) { @@ -1148,7 +1166,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -2127,7 +2148,14 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, kbuf->top_down = top_down; /* Walk the RAM ranges and allocate a suitable range for the buffer */ - ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); if (ret != 1) { /* A suitable memory range could not be found for buffer */ return -EADDRNOTAVAIL; -- cgit v1.2.3 From 8e7d838103feac320baf9e68d73f954840ac1eea Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:13 -0700 Subject: kexec: verify the signature of signed PE bzImage This is the final piece of the puzzle of verifying kernel image signature during kexec_file_load() syscall. This patch calls into PE file routines to verify signature of bzImage. If signature are valid, kexec_file_load() succeeds otherwise it fails. Two new config options have been introduced. First one is CONFIG_KEXEC_VERIFY_SIG. This option enforces that kernel has to be validly signed otherwise kernel load will fail. If this option is not set, no signature verification will be done. Only exception will be when secureboot is enabled. In that case signature verification should be automatically enforced when secureboot is enabled. But that will happen when secureboot patches are merged. Second config option is CONFIG_KEXEC_BZIMAGE_VERIFY_SIG. This option enables signature verification support on bzImage. If this option is not set and previous one is set, kernel image loading will fail because kernel does not have support to verify signature of bzImage. I tested these patches with both "pesign" and "sbsign" signed bzImages. I used signing_key.priv key and signing_key.x509 cert for signing as generated during kernel build process (if module signing is enabled). Used following method to sign bzImage. pesign ====== - Convert DER format cert to PEM format cert openssl x509 -in signing_key.x509 -inform DER -out signing_key.x509.PEM -outform PEM - Generate a .p12 file from existing cert and private key file openssl pkcs12 -export -out kernel-key.p12 -inkey signing_key.priv -in signing_key.x509.PEM - Import .p12 file into pesign db pk12util -i /tmp/kernel-key.p12 -d /etc/pki/pesign - Sign bzImage pesign -i /boot/vmlinuz-3.16.0-rc3+ -o /boot/vmlinuz-3.16.0-rc3+.signed.pesign -c "Glacier signing key - Magrathea" -s sbsign ====== sbsign --key signing_key.priv --cert signing_key.x509.PEM --output /boot/vmlinuz-3.16.0-rc3+.signed.sbsign /boot/vmlinuz-3.16.0-rc3+ Patch details: Well all the hard work is done in previous patches. Now bzImage loader has just call into that code and verify whether bzImage signature are valid or not. Also create two config options. First one is CONFIG_KEXEC_VERIFY_SIG. This option enforces that kernel has to be validly signed otherwise kernel load will fail. If this option is not set, no signature verification will be done. Only exception will be when secureboot is enabled. In that case signature verification should be automatically enforced when secureboot is enabled. But that will happen when secureboot patches are merged. Second config option is CONFIG_KEXEC_BZIMAGE_VERIFY_SIG. This option enables signature verification support on bzImage. If this option is not set and previous one is set, kernel image loading will fail because kernel does not have support to verify signature of bzImage. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Cc: Matt Fleming Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 22 ++++++++++++++++++++++ arch/x86/kernel/kexec-bzimage64.c | 21 +++++++++++++++++++++ arch/x86/kernel/machine_kexec_64.c | 11 +++++++++++ include/linux/kexec.h | 3 +++ kernel/kexec.c | 15 +++++++++++++++ 5 files changed, 72 insertions(+) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9558b9fcafbf..4aafd322e21e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1599,6 +1599,28 @@ config KEXEC interface is strongly in flux, so no good recommendation can be made. +config KEXEC_VERIFY_SIG + bool "Verify kernel signature during kexec_file_load() syscall" + depends on KEXEC + ---help--- + This option makes kernel signature verification mandatory for + kexec_file_load() syscall. If kernel is signature can not be + verified, kexec_file_load() will fail. + + This option enforces signature verification at generic level. + One needs to enable signature verification for type of kernel + image being loaded to make sure it works. For example, enable + bzImage signature verification option to be able to load and + verify signatures of bzImage. Otherwise kernel loading will fail. + +config KEXEC_BZIMAGE_VERIFY_SIG + bool "Enable bzImage signature verification support" + depends on KEXEC_VERIFY_SIG + depends on SIGNED_PE_FILE_VERIFICATION + select SYSTEM_TRUSTED_KEYRING + ---help--- + Enable bzImage signature verification support. + config CRASH_DUMP bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 623e6c58081f..9642b9b33655 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -525,8 +527,27 @@ int bzImage64_cleanup(void *loader_data) return 0; } +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG +int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) +{ + bool trusted; + int ret; + + ret = verify_pefile_signature(kernel, kernel_len, + system_trusted_keyring, &trusted); + if (ret < 0) + return ret; + if (!trusted) + return -EKEYREJECTED; + return 0; +} +#endif + struct kexec_file_ops kexec_bzImage64_ops = { .probe = bzImage64_probe, .load = bzImage64_load, .cleanup = bzImage64_cleanup, +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG + .verify_sig = bzImage64_verify_sig, +#endif }; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9330434da777..8b04018e5d1f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -372,6 +372,17 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } +int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, + unsigned long kernel_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification."); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(kernel, kernel_len); +} + /* * Apply purgatory relocations. * diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9481703b0e7a..4b2a0e11cc5b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -191,11 +191,14 @@ typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len); typedef int (kexec_cleanup_t)(void *loader_data); +typedef int (kexec_verify_sig_t)(const char *kernel_buf, + unsigned long kernel_len); struct kexec_file_ops { kexec_probe_t *probe; kexec_load_t *load; kexec_cleanup_t *cleanup; + kexec_verify_sig_t *verify_sig; }; /* kexec interface functions */ diff --git a/kernel/kexec.c b/kernel/kexec.c index f18c780f9716..0b49a0a58102 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -416,6 +416,12 @@ void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { } +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + /* Apply relocations of type RELA */ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, @@ -494,6 +500,15 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, if (ret) goto out; +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, -- cgit v1.2.3