-rw-r--r--  arch/powerpc/include/asm/powernv.h         |  2
-rw-r--r--  arch/powerpc/kernel/mce_power.c            |  7
-rw-r--r--  arch/powerpc/kernel/smp.c                  | 49
-rw-r--r--  arch/powerpc/kvm/booke.c                   |  7
-rw-r--r--  arch/powerpc/mm/mem.c                      |  2
-rw-r--r--  arch/powerpc/platforms/powernv/memtrace.c  | 17
-rw-r--r--  arch/powerpc/platforms/powernv/npu-dma.c   | 88
-rw-r--r--  arch/powerpc/platforms/powernv/opal-rtc.c  |  8
-rw-r--r--  drivers/cpufreq/powernv-cpufreq.c          | 14
-rw-r--r--  drivers/rtc/rtc-opal.c                     | 37
10 files changed, 166 insertions, 65 deletions
diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h
index d1c2d2e658cf..2f3ff7a27881 100644
--- a/arch/powerpc/include/asm/powernv.h
+++ b/arch/powerpc/include/asm/powernv.h
@@ -15,7 +15,7 @@
 extern void powernv_set_nmmu_ptcr(unsigned long ptcr);
 extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 			unsigned long flags,
-			struct npu_context *(*cb)(struct npu_context *, void *),
+			void (*cb)(struct npu_context *, void *),
 			void *priv);
 extern void pnv_npu2_destroy_context(struct npu_context *context,
 				struct pci_dev *gpdev);
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index fe6fc63251fe..38c5b4764bfe 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -441,7 +441,6 @@ static int mce_handle_ierror(struct pt_regs *regs,
 					if (pfn != ULONG_MAX) {
 						*phys_addr =
 							(pfn << PAGE_SHIFT);
-						handled = 1;
 					}
 				}
 			}
@@ -532,9 +531,7 @@ static int mce_handle_derror(struct pt_regs *regs,
 			 * kernel/exception-64s.h
 			 */
 			if (get_paca()->in_mce < MAX_MCE_DEPTH)
-				if (!mce_find_instr_ea_and_pfn(regs, addr,
-								phys_addr))
-					handled = 1;
+				mce_find_instr_ea_and_pfn(regs, addr, phys_addr);
 		}
 		found = 1;
 	}
@@ -572,7 +569,7 @@ static long mce_handle_error(struct pt_regs *regs,
 		const struct mce_ierror_table itable[])
 {
 	struct mce_error_info mce_err = { 0 };
-	uint64_t addr, phys_addr;
+	uint64_t addr, phys_addr = ULONG_MAX;
 	uint64_t srr1 = regs->msr;
 	long handled;
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index e16ec7b3b427..9ca7148b5881 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -566,10 +566,35 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
 #endif
 
 #ifdef CONFIG_NMI_IPI
-static void stop_this_cpu(struct pt_regs *regs)
-#else
+static void nmi_stop_this_cpu(struct pt_regs *regs)
+{
+	/*
+	 * This is a special case because it never returns, so the NMI IPI
+	 * handling would never mark it as done, which makes any later
+	 * smp_send_nmi_ipi() call spin forever. Mark it done now.
+	 *
+	 * IRQs are already hard disabled by the smp_handle_nmi_ipi.
+	 */
+	nmi_ipi_lock();
+	nmi_ipi_busy_count--;
+	nmi_ipi_unlock();
+
+	/* Remove this CPU */
+	set_cpu_online(smp_processor_id(), false);
+
+	spin_begin();
+	while (1)
+		spin_cpu_relax();
+}
+
+void smp_send_stop(void)
+{
+	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, nmi_stop_this_cpu, 1000000);
+}
+
+#else /* CONFIG_NMI_IPI */
+
 static void stop_this_cpu(void *dummy)
-#endif
 {
 	/* Remove this CPU */
 	set_cpu_online(smp_processor_id(), false);
@@ -582,12 +607,22 @@ static void stop_this_cpu(void *dummy)
 
 void smp_send_stop(void)
 {
-#ifdef CONFIG_NMI_IPI
-	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, stop_this_cpu, 1000000);
-#else
+	static bool stopped = false;
+
+	/*
+	 * Prevent waiting on csd lock from a previous smp_send_stop.
+	 * This is racy, but in general callers try to do the right
+	 * thing and only fire off one smp_send_stop (e.g., see
+	 * kernel/panic.c)
+	 */
+	if (stopped)
+		return;
+
+	stopped = true;
+
 	smp_call_function(stop_this_cpu, NULL, 0);
-#endif
 }
+#endif /* CONFIG_NMI_IPI */
 
 struct thread_info *current_set[NR_CPUS];
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 6038e2e7aee0..876d4f294fdd 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -305,6 +305,13 @@ void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu)
 	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
 }
 
+#ifdef CONFIG_ALTIVEC
+void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_UNAVAIL);
+}
+#endif
+
 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
 {
 	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 737f8a4632cc..c3c39b02b2ba 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -133,6 +133,7 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *
 			start, start + size, rc);
 		return -EFAULT;
 	}
+	flush_inval_dcache_range(start, start + size);
 
 	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
 }
@@ -159,6 +160,7 @@ int __meminit arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap
 
 	/* Remove htab bolted mappings for this section of memory */
 	start = (unsigned long)__va(start);
+	flush_inval_dcache_range(start, start + size);
 	ret = remove_section_mapping(start, start + size);
 
 	/* Ensure all vmalloc mappings are flushed in case they also
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index de470caf0784..fc222a0c2ac4 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -82,19 +82,6 @@ static const struct file_operations memtrace_fops = {
 	.open	= simple_open,
 };
 
-static void flush_memory_region(u64 base, u64 size)
-{
-	unsigned long line_size = ppc64_caches.l1d.size;
-	u64 end = base + size;
-	u64 addr;
-
-	base = round_down(base, line_size);
-	end = round_up(end, line_size);
-
-	for (addr = base; addr < end; addr += line_size)
-		asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory");
-}
-
 static int check_memblock_online(struct memory_block *mem, void *arg)
 {
 	if (mem->state != MEM_ONLINE)
@@ -132,10 +119,6 @@ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
 	walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
 			  change_memblock_state);
 
-	/* RCU grace period? */
-	flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT),
-			    nr_pages << PAGE_SHIFT);
-
 	lock_device_hotplug();
 	remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
 	unlock_device_hotplug();
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 69a4f9e8bd55..525e966dce34 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -34,6 +34,19 @@
 #define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
 
 /*
+ * spinlock to protect initialisation of an npu_context for a particular
+ * mm_struct.
+ */
+static DEFINE_SPINLOCK(npu_context_lock);
+
+/*
+ * When an address shootdown range exceeds this threshold we invalidate the
+ * entire TLB on the GPU for the given PID rather than each specific address in
+ * the range.
+ */
+#define ATSD_THRESHOLD (2*1024*1024)
+
+/*
  * Other types of TCE cache invalidation are not functional in the
  * hardware.
  */
@@ -401,7 +414,7 @@ struct npu_context {
 	bool nmmu_flush;
 
 	/* Callback to stop translation requests on a given GPU */
-	struct npu_context *(*release_cb)(struct npu_context *, void *);
+	void (*release_cb)(struct npu_context *context, void *priv);
 
 	/*
 	 * Private pointer passed to the above callback for usage by
@@ -671,11 +684,19 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
 	struct npu_context *npu_context = mn_to_npu_context(mn);
 	unsigned long address;
 
-	for (address = start; address < end; address += PAGE_SIZE)
-		mmio_invalidate(npu_context, 1, address, false);
+	if (end - start > ATSD_THRESHOLD) {
+		/*
+		 * Just invalidate the entire PID if the address range is too
+		 * large.
+		 */
+		mmio_invalidate(npu_context, 0, 0, true);
+	} else {
+		for (address = start; address < end; address += PAGE_SIZE)
+			mmio_invalidate(npu_context, 1, address, false);
 
-	/* Do the flush only on the final addess == end */
-	mmio_invalidate(npu_context, 1, address, true);
+		/* Do the flush only on the final addess == end */
+		mmio_invalidate(npu_context, 1, address, true);
+	}
 }
 
 static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
@@ -696,11 +717,12 @@ static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
  * Returns an error if there no contexts are currently available or a
  * npu_context which should be passed to pnv_npu2_handle_fault().
  *
- * mmap_sem must be held in write mode.
+ * mmap_sem must be held in write mode and must not be called from interrupt
+ * context.
  */
 struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 			unsigned long flags,
-			struct npu_context *(*cb)(struct npu_context *, void *),
+			void (*cb)(struct npu_context *, void *),
 			void *priv)
 {
 	int rc;
@@ -743,7 +765,9 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 	/*
 	 * Setup the NPU context table for a particular GPU. These need to be
	 * per-GPU as we need the tables to filter ATSDs when there are no
-	 * active contexts on a particular GPU.
+	 * active contexts on a particular GPU. It is safe for these to be
+	 * called concurrently with destroy as the OPAL call takes appropriate
+	 * locks and refcounts on init/destroy.
 	 */
 	rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
 				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
@@ -754,8 +778,29 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 	 * We store the npu pci device so we can more easily get at the
 	 * associated npus.
 	 */
+	spin_lock(&npu_context_lock);
 	npu_context = mm->context.npu_context;
+	if (npu_context) {
+		if (npu_context->release_cb != cb ||
+			npu_context->priv != priv) {
+			spin_unlock(&npu_context_lock);
+			opal_npu_destroy_context(nphb->opal_id, mm->context.id,
+						PCI_DEVID(gpdev->bus->number,
+							gpdev->devfn));
+			return ERR_PTR(-EINVAL);
+		}
+
+		WARN_ON(!kref_get_unless_zero(&npu_context->kref));
+	}
+	spin_unlock(&npu_context_lock);
+
 	if (!npu_context) {
+		/*
+		 * We can set up these fields without holding the
+		 * npu_context_lock as the npu_context hasn't been returned to
+		 * the caller meaning it can't be destroyed. Parallel allocation
+		 * is protected against by mmap_sem.
+		 */
 		rc = -ENOMEM;
 		npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
 		if (npu_context) {
@@ -774,8 +819,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 		}
 
 		mm->context.npu_context = npu_context;
-	} else {
-		WARN_ON(!kref_get_unless_zero(&npu_context->kref));
 	}
 
 	npu_context->release_cb = cb;
@@ -814,15 +857,16 @@ static void pnv_npu2_release_context(struct kref *kref)
 		mm_context_remove_copro(npu_context->mm);
 
 	npu_context->mm->context.npu_context = NULL;
-	mmu_notifier_unregister(&npu_context->mn,
-				npu_context->mm);
-
-	kfree(npu_context);
 }
 
+/*
+ * Destroy a context on the given GPU. May free the npu_context if it is no
+ * longer active on any GPUs. Must not be called from interrupt context.
+ */
 void pnv_npu2_destroy_context(struct npu_context *npu_context,
 			struct pci_dev *gpdev)
 {
+	int removed;
 	struct pnv_phb *nphb;
 	struct npu *npu;
 	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
@@ -844,7 +888,21 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
 	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
 	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
 				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
-	kref_put(&npu_context->kref, pnv_npu2_release_context);
+	spin_lock(&npu_context_lock);
+	removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
+	spin_unlock(&npu_context_lock);
+
+	/*
+	 * We need to do this outside of pnv_npu2_release_context so that it is
+	 * outside the spinlock as mmu_notifier_destroy uses SRCU.
+	 */
+	if (removed) {
+		mmu_notifier_unregister(&npu_context->mn,
+					npu_context->mm);
+
+		kfree(npu_context);
+	}
+
 }
 EXPORT_SYMBOL(pnv_npu2_destroy_context);
 
diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c
index f8868864f373..aa2a5139462e 100644
--- a/arch/powerpc/platforms/powernv/opal-rtc.c
+++ b/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -48,10 +48,12 @@ unsigned long __init opal_get_boot_time(void)
 
 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
-		if (rc == OPAL_BUSY_EVENT)
+		if (rc == OPAL_BUSY_EVENT) {
+			mdelay(OPAL_BUSY_DELAY_MS);
 			opal_poll_events(NULL);
-		else if (rc == OPAL_BUSY)
-			mdelay(10);
+		} else if (rc == OPAL_BUSY) {
+			mdelay(OPAL_BUSY_DELAY_MS);
+		}
 	}
 	if (rc != OPAL_SUCCESS)
 		return 0;
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 0591874856d3..54edaec1e608 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -679,6 +679,16 @@ void gpstate_timer_handler(struct timer_list *t)
 
 	if (!spin_trylock(&gpstates->gpstate_lock))
 		return;
+	/*
+	 * If the timer has migrated to the different cpu then bring
+	 * it back to one of the policy->cpus
+	 */
+	if (!cpumask_test_cpu(raw_smp_processor_id(), policy->cpus)) {
+		gpstates->timer.expires = jiffies + msecs_to_jiffies(1);
+		add_timer_on(&gpstates->timer, cpumask_first(policy->cpus));
+		spin_unlock(&gpstates->gpstate_lock);
+		return;
+	}
 
 	/*
 	 * If PMCR was last updated was using fast_swtich then
@@ -718,10 +728,8 @@ void gpstate_timer_handler(struct timer_list *t)
 	if (gpstate_idx != gpstates->last_lpstate_idx)
 		queue_gpstate_timer(gpstates);
 
+	set_pstate(&freq_data);
 	spin_unlock(&gpstates->gpstate_lock);
-
-	/* Timer may get migrated to a different cpu on cpu hot unplug */
-	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
 }
 
 /*
diff --git a/drivers/rtc/rtc-opal.c b/drivers/rtc/rtc-opal.c
index 304e891e35fc..60f2250fd96b 100644
--- a/drivers/rtc/rtc-opal.c
+++ b/drivers/rtc/rtc-opal.c
@@ -57,7 +57,7 @@ static void tm_to_opal(struct rtc_time *tm, u32 *y_m_d, u64 *h_m_s_ms)
 
 static int opal_get_rtc_time(struct device *dev, struct rtc_time *tm)
 {
-	long rc = OPAL_BUSY;
+	s64 rc = OPAL_BUSY;
 	int retries = 10;
 	u32 y_m_d;
 	u64 h_m_s_ms;
@@ -66,13 +66,17 @@ static int opal_get_rtc_time(struct device *dev, struct rtc_time *tm)
 
 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
-		if (rc == OPAL_BUSY_EVENT)
+		if (rc == OPAL_BUSY_EVENT) {
+			msleep(OPAL_BUSY_DELAY_MS);
 			opal_poll_events(NULL);
-		else if (retries-- && (rc == OPAL_HARDWARE
-				       || rc == OPAL_INTERNAL_ERROR))
-			msleep(10);
-		else if (rc != OPAL_BUSY && rc != OPAL_BUSY_EVENT)
-			break;
+		} else if (rc == OPAL_BUSY) {
+			msleep(OPAL_BUSY_DELAY_MS);
+		} else if (rc == OPAL_HARDWARE || rc == OPAL_INTERNAL_ERROR) {
+			if (retries--) {
+				msleep(10); /* Wait 10ms before retry */
+				rc = OPAL_BUSY; /* go around again */
+			}
+		}
 	}
 
 	if (rc != OPAL_SUCCESS)
@@ -87,21 +91,26 @@ static int opal_get_rtc_time(struct device *dev, struct rtc_time *tm)
 
 static int opal_set_rtc_time(struct device *dev, struct rtc_time *tm)
 {
-	long rc = OPAL_BUSY;
+	s64 rc = OPAL_BUSY;
 	int retries = 10;
 	u32 y_m_d = 0;
 	u64 h_m_s_ms = 0;
 
 	tm_to_opal(tm, &y_m_d, &h_m_s_ms);
+
 	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 		rc = opal_rtc_write(y_m_d, h_m_s_ms);
-		if (rc == OPAL_BUSY_EVENT)
+		if (rc == OPAL_BUSY_EVENT) {
+			msleep(OPAL_BUSY_DELAY_MS);
 			opal_poll_events(NULL);
-		else if (retries-- && (rc == OPAL_HARDWARE
-				       || rc == OPAL_INTERNAL_ERROR))
-			msleep(10);
-		else if (rc != OPAL_BUSY && rc != OPAL_BUSY_EVENT)
-			break;
+		} else if (rc == OPAL_BUSY) {
+			msleep(OPAL_BUSY_DELAY_MS);
+		} else if (rc == OPAL_HARDWARE || rc == OPAL_INTERNAL_ERROR) {
+			if (retries--) {
+				msleep(10); /* Wait 10ms before retry */
+				rc = OPAL_BUSY; /* go around again */
+			}
+		}
 	}
 
 	return rc == OPAL_SUCCESS ? 0 : -EIO;
