diff options
Diffstat (limited to 'kernel/bpf/stackmap.c')
| -rw-r--r-- | kernel/bpf/stackmap.c | 257 | 
1 files changed, 235 insertions, 22 deletions
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b0ecf43f5894..57eeb1234b67 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,16 +9,19 @@
 #include <linux/filter.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
 #include "percpu_freelist.h"
 
-#define STACK_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK					\
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
+	 BPF_F_STACK_BUILD_ID)
 
 struct stack_map_bucket {
 	struct pcpu_freelist_node fnode;
 	u32 hash;
 	u32 nr;
-	u64 ip[];
+	u64 data[];
 };
 
 struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+	return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+	return stack_map_use_build_id(map) ?
+		sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > sysctl_perf_event_max_stack)
+	    value_size < 8 || value_size % 8)
+		return ERR_PTR(-EINVAL);
+
+	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+		if (value_size % sizeof(struct bpf_stack_build_id) ||
+		    value_size / sizeof(struct bpf_stack_build_id)
+		    > sysctl_perf_event_max_stack)
+			return ERR_PTR(-EINVAL);
+	} else if (value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -114,13 +136,184 @@ free_smap:
 	return ERR_PTR(err);
 }
 
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+					   unsigned char *build_id,
+					   void *note_start,
+					   Elf32_Word note_size)
+{
+	Elf32_Word note_offs = 0, new_offs;
+
+	/* check for overflow */
+	if (note_start < page_addr || note_start + note_size < note_start)
+		return -EINVAL;
+
+	/* only supports note that fits in the first page */
+	if (note_start + note_size > page_addr + PAGE_SIZE)
+		return -EINVAL;
+
+	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+		if (nhdr->n_type == BPF_BUILD_ID &&
+		    nhdr->n_namesz == sizeof("GNU") &&
+		    nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+			memcpy(build_id,
+			       note_start + note_offs +
+			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+			       BPF_BUILD_ID_SIZE);
+			return 0;
+		}
+		new_offs = note_offs + sizeof(Elf32_Nhdr) +
+			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+		if (new_offs <= note_offs)  /* overflow */
+			break;
+		note_offs = new_offs;
+	}
+	return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF */
+static int stack_map_get_build_id_32(void *page_addr,
+				     unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+	Elf32_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i)
+		if (phdr[i].p_type == PT_NOTE)
+			return stack_map_parse_build_id(page_addr, build_id,
+					page_addr + phdr[i].p_offset,
+					phdr[i].p_filesz);
+	return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF */
+static int stack_map_get_build_id_64(void *page_addr,
+				     unsigned char *build_id)
+{
+	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+	Elf64_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i)
+		if (phdr[i].p_type == PT_NOTE)
+			return stack_map_parse_build_id(page_addr, build_id,
+					page_addr + phdr[i].p_offset,
+					phdr[i].p_filesz);
+	return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+				  unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr;
+	struct page *page;
+	void *page_addr;
+	int ret;
+
+	/* only works for page backed storage  */
+	if (!vma->vm_file)
+		return -EINVAL;
+
+	page = find_get_page(vma->vm_file->f_mapping, 0);
+	if (!page)
+		return -EFAULT;	/* page not mapped */
+
+	ret = -EINVAL;
+	page_addr = page_address(page);
+	ehdr = (Elf32_Ehdr *)page_addr;
+
+	/* compare magic x7f "ELF" */
+	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+		goto out;
+
+	/* only support executable file and shared object file */
+	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+		goto out;
+
+	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+		ret = stack_map_get_build_id_32(page_addr, build_id);
+	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+		ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+	put_page(page);
+	return ret;
+}
+
+static void stack_map_get_build_id_offset(struct bpf_map *map,
+					  struct stack_map_bucket *bucket,
+					  u64 *ips, u32 trace_nr, bool user)
+{
+	int i;
+	struct vm_area_struct *vma;
+	struct bpf_stack_build_id *id_offs;
+
+	bucket->nr = trace_nr;
+	id_offs = (struct bpf_stack_build_id *)bucket->data;
+
+	/*
+	 * We cannot do up_read() in nmi context, so build_id lookup is
+	 * only supported for non-nmi events. If at some point, it is
+	 * possible to run find_vma() without taking the semaphore, we
+	 * would like to allow build_id lookup in nmi context.
+	 *
+	 * Same fallback is used for kernel stack (!user) on a stackmap
+	 * with build_id.
+	 */
+	if (!user || !current || !current->mm || in_nmi() ||
+	    down_read_trylock(&current->mm->mmap_sem) == 0) {
+		/* cannot access current->mm, fall back to ips */
+		for (i = 0; i < trace_nr; i++) {
+			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+			id_offs[i].ip = ips[i];
+		}
+		return;
+	}
+
+	for (i = 0; i < trace_nr; i++) {
+		vma = find_vma(current->mm, ips[i]);
+		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+			/* per entry fall back to ips */
+			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+			id_offs[i].ip = ips[i];
+			continue;
+		}
+		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+			- vma->vm_start;
+		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+	}
+	up_read(&current->mm->mmap_sem);
+}
+
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-	u32 max_depth = map->value_size / 8;
+	u32 max_depth = map->value_size / stack_map_data_size(map);
 	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
 	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	bool user = flags & BPF_F_USER_STACK;
 	bool kernel = !user;
 	u64 *ips;
+	bool hash_matches;
 
 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	id = hash & (smap->n_buckets - 1);
 	bucket = READ_ONCE(smap->buckets[id]);
 
-	if (bucket && bucket->hash == hash) {
-		if (flags & BPF_F_FAST_STACK_CMP)
+	hash_matches = bucket && bucket->hash == hash;
+	/* fast cmp */
+	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+		return id;
+
+	if (stack_map_use_build_id(map)) {
+		/* for build_id+offset, pop a bucket before slow cmp */
+		new_bucket = (struct stack_map_bucket *)
+			pcpu_freelist_pop(&smap->freelist);
+		if (unlikely(!new_bucket))
+			return -ENOMEM;
+		stack_map_get_build_id_offset(map, new_bucket, ips,
+					      trace_nr, user);
+		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+		if (hash_matches && bucket->nr == trace_nr &&
+		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
 			return id;
-		if (bucket->nr == trace_nr &&
-		    memcmp(bucket->ip, ips, trace_len) == 0)
+		}
+		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+			return -EEXIST;
+		}
+	} else {
+		if (hash_matches && bucket->nr == trace_nr &&
+		    memcmp(bucket->data, ips, trace_len) == 0)
 			return id;
+		if (bucket && !(flags & BPF_F_REUSE_STACKID))
+			return -EEXIST;
+
+		new_bucket = (struct stack_map_bucket *)
+			pcpu_freelist_pop(&smap->freelist);
+		if (unlikely(!new_bucket))
+			return -ENOMEM;
+		memcpy(new_bucket->data, ips, trace_len);
 	}
 
-	/* this call stack is not in the map, try to add it */
-	if (bucket && !(flags & BPF_F_REUSE_STACKID))
-		return -EEXIST;
-
-	new_bucket = (struct stack_map_bucket *)
-		pcpu_freelist_pop(&smap->freelist);
-	if (unlikely(!new_bucket))
-		return -ENOMEM;
-
-	memcpy(new_bucket->ip, ips, trace_len);
 	new_bucket->hash = hash;
 	new_bucket->nr = trace_nr;
 
@@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 	if (!bucket)
 		return -ENOENT;
 
-	trace_len = bucket->nr * sizeof(u64);
-	memcpy(value, bucket->ip, trace_len);
+	trace_len = bucket->nr * stack_map_data_size(map);
+	memcpy(value, bucket->data, trace_len);
 	memset(value + trace_len, 0, map->value_size - trace_len);
 
 	old_bucket = xchg(&smap->buckets[id], bucket);
