| field | value | date |
|---|---|---|
| author | Thomas Gleixner <tglx@linutronix.de> | 2018-06-22 21:20:35 +0200 |
| committer | Thomas Gleixner <tglx@linutronix.de> | 2018-06-22 21:20:35 +0200 |
| commit | 7731b8bc94e599c9a79e428f3359ff2c34b7576a (patch) | |
| tree | 879f18ccbe274122f2d4f095b43cbc7f953e0ada /fs/dax.c | |
| parent | 48e315618dc4dc8904182cd221e3d395d5d97005 (diff) | |
| parent | 9ffc59d57228d74809700be6f7ecb1db10292f05 (diff) | |
Merge branch 'linus' into x86/urgent
Required to queue a dependent fix.
Diffstat (limited to 'fs/dax.c')
| mode | file | changes |
|---|---|---|
| -rw-r--r-- | fs/dax.c | 216 |

1 file changed, 149 insertions, 67 deletions
@@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 	}
 }
 
+static struct page *dax_busy_page(void *entry)
+{
+	unsigned long pfn;
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		if (page_ref_count(page) > 1)
+			return page;
+	}
+	return NULL;
+}
+
 /*
  * Find radix tree entry at given index. If it points to an exceptional entry,
  * return it with the radix tree entry locked. If the radix tree doesn't
@@ -492,6 +505,90 @@ restart:
 	return entry;
 }
 
+/**
+ * dax_layout_busy_page - find first pinned page in @mapping
+ * @mapping: address space to scan for a page with ref count > 1
+ *
+ * DAX requires ZONE_DEVICE mapped pages. These pages are never
+ * 'onlined' to the page allocator so they are considered idle when
+ * page->count == 1. A filesystem uses this interface to determine if
+ * any page in the mapping is busy, i.e. for DMA, or other
+ * get_user_pages() usages.
+ *
+ * It is expected that the filesystem is holding locks to block the
+ * establishment of new mappings in this address_space. I.e. it expects
+ * to be able to run unmap_mapping_range() and subsequently not race
+ * mapping_mapped() becoming true.
+ */
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+	pgoff_t	indices[PAGEVEC_SIZE];
+	struct page *page = NULL;
+	struct pagevec pvec;
+	pgoff_t	index, end;
+	unsigned i;
+
+	/*
+	 * In the 'limited' case get_user_pages() for dax is disabled.
+	 */
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return NULL;
+
+	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+		return NULL;
+
+	pagevec_init(&pvec);
+	index = 0;
+	end = -1;
+
+	/*
+	 * If we race get_user_pages_fast() here either we'll see the
+	 * elevated page count in the pagevec_lookup and wait, or
+	 * get_user_pages_fast() will see that the page it took a reference
+	 * against is no longer mapped in the page tables and bail to the
+	 * get_user_pages() slow path.  The slow path is protected by
+	 * pte_lock() and pmd_lock(). New references are not taken without
+	 * holding those locks, and unmap_mapping_range() will not zero the
+	 * pte or pmd without holding the respective lock, so we are
+	 * guaranteed to either see new references or prevent new
+	 * references from being established.
+	 */
+	unmap_mapping_range(mapping, 0, 0, 1);
+
+	while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
+				indices)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *pvec_ent = pvec.pages[i];
+			void *entry;
+
+			index = indices[i];
+			if (index >= end)
+				break;
+
+			if (!radix_tree_exceptional_entry(pvec_ent))
+				continue;
+
+			xa_lock_irq(&mapping->i_pages);
+			entry = get_unlocked_mapping_entry(mapping, index, NULL);
+			if (entry)
+				page = dax_busy_page(entry);
+			put_unlocked_mapping_entry(mapping, index, entry);
+			xa_unlock_irq(&mapping->i_pages);
+			if (page)
+				break;
+		}
+		pagevec_remove_exceptionals(&pvec);
+		pagevec_release(&pvec);
+		index++;
+
+		if (page)
+			break;
+	}
+	return page;
+}
+EXPORT_SYMBOL_GPL(dax_layout_busy_page);
+
 static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 					  pgoff_t index, bool trunc)
 {
@@ -677,7 +774,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
 		 * downgrading page table protection not changing it to point
 		 * to a new page.
 		 *
-		 * See Documentation/vm/mmu_notifier.txt
+		 * See Documentation/vm/mmu_notifier.rst
 		 */
 		if (pmdp) {
 #ifdef CONFIG_FS_DAX_PMD
@@ -905,14 +1002,13 @@ out:
  * If this page is ever written to we will re-fault and change the mapping to
  * point to real DAX storage instead.
  */
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
 			 struct vm_fault *vmf)
 {
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
-	int ret = VM_FAULT_NOPAGE;
+	vm_fault_t ret = VM_FAULT_NOPAGE;
 	struct page *zero_page;
-	void *entry2;
 	pfn_t pfn;
 
 	zero_page = ZERO_PAGE(0);
@@ -922,14 +1018,9 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	}
 
 	pfn = page_to_pfn_t(zero_page);
-	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
-			RADIX_DAX_ZERO_PAGE, false);
-	if (IS_ERR(entry2)) {
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
-	vm_insert_mixed(vmf->vma, vaddr, pfn);
+	dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
+			false);
+	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
 out:
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;
@@ -991,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	struct iov_iter *iter = data;
 	loff_t end = pos + length, done = 0;
 	ssize_t ret = 0;
+	size_t xfer;
 	int id;
 
 	if (iov_iter_rw(iter) == READ) {
@@ -1054,18 +1146,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		 * vfs_write(), depending on which operation we are doing.
 		 */
 		if (iov_iter_rw(iter) == WRITE)
-			map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
 					map_len, iter);
 		else
-			map_len = copy_to_iter(kaddr, map_len, iter);
-		if (map_len <= 0) {
-			ret = map_len ? map_len : -EFAULT;
-			break;
-		}
+			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
+					map_len, iter);
+
+		pos += xfer;
+		length -= xfer;
+		done += xfer;
-
-		pos += map_len;
-		length -= map_len;
-		done += map_len;
+		if (xfer == 0)
+			ret = -EFAULT;
+		if (xfer < map_len)
+			break;
 	}
 	dax_read_unlock(id);
 
@@ -1112,7 +1206,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
-static int dax_fault_return(int error)
+static vm_fault_t dax_fault_return(int error)
 {
 	if (error == 0)
 		return VM_FAULT_NOPAGE;
@@ -1132,7 +1226,7 @@ static bool dax_fault_is_synchronous(unsigned long flags,
 		&& (iomap->flags & IOMAP_F_DIRTY);
 }
 
-static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       int *iomap_errp, const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
@@ -1145,18 +1239,18 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	int error, major = 0;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool sync;
-	int vmf_ret = 0;
+	vm_fault_t ret = 0;
 	void *entry;
 	pfn_t pfn;
 
-	trace_dax_pte_fault(inode, vmf, vmf_ret);
+	trace_dax_pte_fault(inode, vmf, ret);
 	/*
 	 * Check whether offset isn't beyond end of file now. Caller is supposed
 	 * to hold locks serializing us with truncate / punch hole so this is
 	 * a reliable test.
 	 */
 	if (pos >= i_size_read(inode)) {
-		vmf_ret = VM_FAULT_SIGBUS;
+		ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
@@ -1165,7 +1259,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
 	if (IS_ERR(entry)) {
-		vmf_ret = dax_fault_return(PTR_ERR(entry));
+		ret = dax_fault_return(PTR_ERR(entry));
 		goto out;
 	}
 
@@ -1176,7 +1270,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * retried.
 	 */
 	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
-		vmf_ret = VM_FAULT_NOPAGE;
+		ret = VM_FAULT_NOPAGE;
 		goto unlock_entry;
 	}
 
@@ -1189,7 +1283,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	if (iomap_errp)
 		*iomap_errp = error;
 	if (error) {
-		vmf_ret = dax_fault_return(error);
+		ret = dax_fault_return(error);
 		goto unlock_entry;
 	}
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
@@ -1219,9 +1313,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			goto error_finish_iomap;
 
 		__SetPageUptodate(vmf->cow_page);
-		vmf_ret = finish_fault(vmf);
-		if (!vmf_ret)
-			vmf_ret = VM_FAULT_DONE_COW;
+		ret = finish_fault(vmf);
+		if (!ret)
+			ret = VM_FAULT_DONE_COW;
 		goto finish_iomap;
 	}
 
@@ -1240,10 +1334,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
 		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 						 0, write && !sync);
-		if (IS_ERR(entry)) {
-			error = PTR_ERR(entry);
-			goto error_finish_iomap;
-		}
 
 		/*
 		 * If we are doing synchronous page fault and inode needs fsync,
@@ -1257,23 +1347,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 				goto error_finish_iomap;
 			}
 			*pfnp = pfn;
-			vmf_ret = VM_FAULT_NEEDDSYNC | major;
+			ret = VM_FAULT_NEEDDSYNC | major;
 			goto finish_iomap;
 		}
 		trace_dax_insert_mapping(inode, vmf, entry);
 		if (write)
-			error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+			ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
 		else
-			error = vm_insert_mixed(vma, vaddr, pfn);
+			ret = vmf_insert_mixed(vma, vaddr, pfn);
 
-		/* -EBUSY is fine, somebody else faulted on the same PTE */
-		if (error == -EBUSY)
-			error = 0;
-		break;
+		goto finish_iomap;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!write) {
-			vmf_ret = dax_load_hole(mapping, entry, vmf);
+			ret = dax_load_hole(mapping, entry, vmf);
 			goto finish_iomap;
 		}
 		/*FALLTHRU*/
@@ -1284,12 +1371,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	}
 
  error_finish_iomap:
-	vmf_ret = dax_fault_return(error) | major;
+	ret = dax_fault_return(error);
 finish_iomap:
 	if (ops->iomap_end) {
 		int copied = PAGE_SIZE;
 
-		if (vmf_ret & VM_FAULT_ERROR)
+		if (ret & VM_FAULT_ERROR)
 			copied = 0;
 		/*
 		 * The fault is done by now and there's no way back (other
@@ -1302,12 +1389,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff);
 out:
-	trace_dax_pte_fault_done(inode, vmf, vmf_ret);
-	return vmf_ret;
+	trace_dax_pte_fault_done(inode, vmf, ret);
+	return ret | major;
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
+static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		void *entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1327,8 +1414,6 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	pfn = page_to_pfn_t(zero_page);
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
-	if (IS_ERR(ret))
-		goto fallback;
 
 	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
 	if (!pmd_none(*(vmf->pmd))) {
@@ -1348,7 +1433,7 @@ fallback:
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
@@ -1358,7 +1443,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	bool sync;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
-	int result = VM_FAULT_FALLBACK;
+	vm_fault_t result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
 	pgoff_t max_pgoff, pgoff;
 	void *entry;
@@ -1450,8 +1535,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 
 		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 						RADIX_DAX_PMD, write && !sync);
-		if (IS_ERR(entry))
-			goto finish_iomap;
 
 		/*
 		 * If we are doing synchronous page fault and inode needs fsync,
@@ -1509,7 +1592,7 @@ out:
 	return result;
 }
 #else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
@@ -1529,7 +1612,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
  * has done all the necessary locking for page fault to proceed
  * successfully.
  */
-int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
 {
 	switch (pe_size) {
@@ -1553,14 +1636,14 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
 * DAX file.  It takes care of marking corresponding radix tree entry as dirty
 * as well.
 */
-static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
 				  enum page_entry_size pe_size,
 				  pfn_t pfn)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	void *entry, **slot;
 	pgoff_t index = vmf->pgoff;
-	int vmf_ret, error;
+	vm_fault_t ret;
 
 	xa_lock_irq(&mapping->i_pages);
 	entry = get_unlocked_mapping_entry(mapping, index, &slot);
@@ -1579,21 +1662,20 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
 	xa_unlock_irq(&mapping->i_pages);
 	switch (pe_size) {
 	case PE_SIZE_PTE:
-		error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
-		vmf_ret = dax_fault_return(error);
+		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
 		break;
 #ifdef CONFIG_FS_DAX_PMD
 	case PE_SIZE_PMD:
-		vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+		ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
 			pfn, true);
 		break;
#endif
 	default:
-		vmf_ret = VM_FAULT_FALLBACK;
+		ret = VM_FAULT_FALLBACK;
 	}
 	put_locked_mapping_entry(mapping, index);
-	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
-	return vmf_ret;
+	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
+	return ret;
 }
 
 /**
@@ -1606,8 +1688,8 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
 * stored persistently on the media and handles inserting of appropriate page
 * table entry.
 */
-int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-			  pfn_t pfn)
+vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size, pfn_t pfn)
 {
 	int err;
 	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
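The dax_layout_busy_page() export added above is the hook a filesystem calls before truncating or hole-punching a DAX file: as its kernel-doc notes, the caller must already hold locks that block new mappings, after which any page still referenced is pinned by DMA or some other get_user_pages() user. A minimal filesystem-side sketch follows; the fs_wait_for_page_idle() helper is hypothetical and only stands in for the real wait (in-tree callers sleep until the page refcount drops back to 1):

```c
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm_types.h>

/* Hypothetical helper: sleep until page_ref_count(page) returns to 1. */
int fs_wait_for_page_idle(struct page *page);

static int fs_break_dax_layouts(struct inode *inode)
{
	struct page *page;
	int error;

	/*
	 * dax_layout_busy_page() unmaps the whole mapping and returns the
	 * first still-pinned page, or NULL once every page is idle.
	 */
	while ((page = dax_layout_busy_page(inode->i_mapping))) {
		error = fs_wait_for_page_idle(page);
		if (error)
			return error;
	}
	return 0;
}
```

The loop re-checks after every wait because another page may have been pinned while the caller slept; keeping new faults out for the duration is the filesystem's responsibility, exactly as the comment block in the patch stresses.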

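The remaining hunks switch the fault paths from int to the new vm_fault_t type and replace vm_insert_mixed()/vm_insert_mixed_mkwrite() with their vmf_* counterparts, which return a VM_FAULT_* code directly instead of an errno the caller had to translate (hence the deleted "-EBUSY is fine" block). A rough illustration of the calling pattern this enables, not taken from the patch itself:

```c
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/pfn_t.h>

/*
 * Illustrative only: the vmf_* helpers already return a VM_FAULT_* code,
 * so a handler can pass their result straight up instead of mapping an
 * errno such as -EBUSY or -ENOMEM onto a fault status by hand.
 */
static vm_fault_t example_mixed_fault(struct vm_fault *vmf, pfn_t pfn)
{
	if (vmf->flags & FAULT_FLAG_WRITE)
		return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);

	return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}
```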